diff --git a/.gitignore b/.gitignore
index 9db2912c07bc2d6abb01c322a25519ac0ff158fa..ed131bdbbad6bd4dad500fa29f40a29fddeb7593 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,7 @@
 build/
 build_fpga/
+docs/_build/
 .idea/
diff --git a/.travis.yml b/.travis.yml
index c902afef91b816390170f1b7e1c8e4b07c7b0645..bee77d08304881c718483b88e1ea7e55228483e2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 language: cpp
 cache: ccache
 sudo: required
-dist: trusty
+dist: xenial
 os:
   - linux
@@ -18,7 +18,7 @@ addons:
     - clang-format-3.8
 before_install:
-  - sudo pip install cpplint pre-commit
+  - sudo pip install cpplint pre-commit==1.10.3
   - sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format
   # Download and install recent cmake
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77a94bea1efcdafaa67b4c078bfb0a756f7b1cec..e3f7a211d70920aa74765b976af6939d55a328ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
 lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
 lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
 lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
+lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
 lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
@@ -73,8 +74,8 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
 lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
 lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
-lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
-
+lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
+lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
 if(ANDROID OR IOS OR ARMLINUX)
@@ -169,6 +170,10 @@ endif()
 ########################################################################################
+if(LITE_WITH_XPU)
+    include(xpu)
+endif()
+
 include(external/mklml)    # download mklml package
 include(external/xbyak)    # download xbyak package
 include(external/libxsmm)  # download, build, install libxsmm
@@ -188,10 +193,9 @@ if(LITE_WITH_CUDA)
     include(cuda)
 endif()
-if(LITE_WITH_XPU)
-    include(xpu)
+if(LITE_WITH_BM)
+    include(bm)
 endif()
-
 include(generic)            # simplify cmake module
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
diff --git a/README.md b/README.md
index 23974beee9a8af5ee7e2c454575efff2e3d96ee2..22b84888294b5ef60c3d91d7a7909aef8f601d81 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Framework compatibility: In addition to models trained on PaddlePaddle, those tr
 Paddle Lite is designed to support a wide range of hardware and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and light-weight applications on devices.
-![img](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)
+![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
 As is shown in the figure above, the analysis phase includes the Machine IR module, which enables optimizations like op fusion and redundant computation pruning.
Besides, the execution phase only involves kernel execution, so it can be deployed on its own to ensure maximally lightweight deployment.
diff --git a/README_cn.md b/README_cn.md
index 99d38c47ffbbaa3b8593801701e3528167899f97..11d3967fe8ce88826ca982b71d96268c1a7e5c3a 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -34,7 +34,7 @@ Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在
 PaddleLite 的架构设计着重考虑了对多硬件和平台的支持,并且强化了多个硬件在一个模型中混合执行的能力,多个层面的性能优化处理,以及对端侧应用的轻量化设计。
-![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)
+![](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
 其中,Analysis Phase 包括了 MIR(Machine IR) 相关模块,能够对原有的模型的计算图针对具体的硬件列表进行算子融合、计算裁剪 在内的多种优化。Execution Phase 只涉及到Kernel 的执行,且可以单独部署,以支持极致的轻量级部署。
diff --git a/cmake/bm.cmake b/cmake/bm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3a3abb5966172ba00227e9fac7fabfe55bac7737
--- /dev/null
+++ b/cmake/bm.cmake
@@ -0,0 +1,80 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_BM)
+  return()
+endif()
+
+if(NOT DEFINED BM_SDK_ROOT)
+  set(BM_SDK_ROOT $ENV{BM_SDK_ROOT})
+  if(NOT BM_SDK_ROOT)
+    message(FATAL_ERROR "Must set BM_SDK_ROOT or env BM_SDK_ROOT when LITE_WITH_BM=ON")
+  endif()
+endif()
+
+message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")
+find_path(BM_SDK_INC NAMES bmruntime_interface.h
+  PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
+if(NOT BM_SDK_INC)
+  message(FATAL_ERROR "Can not find bmruntime_interface.h in ${BM_SDK_ROOT}/include")
+endif()
+
+include_directories("${BM_SDK_ROOT}/include/bmruntime")
+include_directories("${BM_SDK_ROOT}/include/bmlib")
+include_directories("${BM_SDK_ROOT}/include/bmcompiler")
+include_directories("${BM_SDK_ROOT}/include/bmcpu")
+include_directories("${BM_SDK_ROOT}/include/bmlog")
+
+find_library(BM_SDK_RT_LIB NAMES bmrt
+  PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+if(NOT BM_SDK_RT_LIB)
+  message(FATAL_ERROR "Can not find bmrt Library in ${BM_SDK_ROOT}")
+else()
+  message(STATUS "Found bmrt Library: ${BM_SDK_RT_LIB}")
+  add_library(bmrt SHARED IMPORTED GLOBAL)
+  set_property(TARGET bmrt PROPERTY IMPORTED_LOCATION ${BM_SDK_RT_LIB})
+endif()
+
+find_library(BM_SDK_BM_LIB NAMES bmlib
+  PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
+if(NOT BM_SDK_BM_LIB)
+  message(FATAL_ERROR "Can not find bmlib Library in ${BM_SDK_ROOT}")
+else()
+  message(STATUS "Found bmlib Library: ${BM_SDK_BM_LIB}")
+  add_library(bmlib SHARED IMPORTED GLOBAL)
+  set_property(TARGET bmlib PROPERTY IMPORTED_LOCATION ${BM_SDK_BM_LIB})
+endif()
+
+find_library(BM_SDK_COMPILER_LIB NAMES bmcompiler
+  PATHS ${BM_SDK_ROOT}/lib/bmcompiler)
+if(NOT BM_SDK_COMPILER_LIB)
+  message(FATAL_ERROR "Can not find bmcompiler Library in ${BM_SDK_ROOT}")
+else()
+  message(STATUS "Found bmcompiler Library: ${BM_SDK_COMPILER_LIB}")
+  add_library(bmcompiler SHARED IMPORTED GLOBAL)
+  set_property(TARGET bmcompiler PROPERTY IMPORTED_LOCATION ${BM_SDK_COMPILER_LIB})
+endif()
+
+find_library(BM_SDK_CPU_LIB NAMES bmcpu + PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie) +if(NOT BM_SDK_CPU_LIB) + message(FATAL_ERROR "Can not find bmcpu Library in ${BM_SDK_ROOT}") +else() + message(STATUS "Found bmcpu Library: ${BM_SDK_CPU_LIB}") + add_library(bmcpu SHARED IMPORTED GLOBAL) + set_property(TARGET bmcpu PROPERTY IMPORTED_LOCATION ${BM_SDK_CPU_LIB}) +endif() + +set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs") +set(bm_builder_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm builder libs") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bc055d3186c6bfd77ff6a5e9f979af5082fa34e3..752b22461d9d1c36b3ca6a0bfe472a5dcc3ab976 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -143,6 +143,10 @@ if (LITE_WITH_FPGA) add_definitions("-DLITE_WITH_FPGA") endif() +if (LITE_WITH_BM) +add_definitions("-DLITE_WITH_BM") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") if (LITE_WITH_PRECISION_PROFILE) diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 76f62765aff791594123d689341b0876b3d0184d..0597ef0cc4ba4c0bcec172c767d66d0f362e1459 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -120,6 +120,7 @@ # ## Lite settings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") if (ARM_TARGET_OS STREQUAL "ios") set(PLATFORM "OS") elseif(ARM_TARGET_OS STREQUAL "ios64") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 25aa4d2bc8c1c145e7a103c9164e1c9e231a8f9e..c22bb1db4fbf8a7370ff3e7c9aca40cc94d550a2 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() -include_directories("${NPU_DDK_ROOT}") +include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 88ac3e101a686cb49ef5a4c3b1879c15b8f7b57b..7466b3e6d438277ad31020f76665bf689df436f5 100644 --- a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -63,7 +63,7 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -fvisibility=hidden -fvisibility-inlines-hidden -fdata-sections -ffunction-sections") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9ff908a4c87d55e87468a06ae0e6085ac165a1b1..cfbda63f6d784a55803e3d3a44b9ec6a987bd964 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -174,15 +174,44 @@ if(NOT WITH_DSO) endif(WIN32) endif(NOT WITH_DSO) -get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY) -function(import_static_library alias path) - add_library(${alias} STATIC IMPORTED GLOBAL) - set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) +function(add_cuda_lib TARGET_NAME) + set(options STATIC SHARED) + set(oneValueArgs "NAME") + set(multiValueArgs "PATHS") + cmake_parse_arguments(add_cuda_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + unset(ABS_PATH CACHE) + if (NOT add_cuda_lib_PATHS) 
+ set(add_cuda_lib_PATHS CUDNN_CHECK_LIBRARY_DIRS) + endif() + find_library(ABS_PATH NAMES ${add_cuda_lib_NAME} PATHS ${${add_cuda_lib_PATHS}} NO_DEFAULT_PATH) + add_library(${TARGET_NAME} SHARED IMPORTED GLOBAL) + set_property(TARGET ${TARGET_NAME} PROPERTY IMPORTED_LOCATION ${ABS_PATH}) + set(CUDA_MODULES ${CUDA_MODULES} ${TARGET_NAME} PARENT_SCOPE) + if (NOT ABS_PATH) + message(FATAL_ERROR "Can not find CUDA library: ${add_cuda_lib_NAME}") + endif() endfunction() -import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a) -import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a) -import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a) -import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a) + +if(LITE_WITH_STATIC_CUDA) + message(STATUS "Static link CUDA toolkit.") + add_cuda_lib(cudart_static STATIC NAME libcudart_static.a) + add_cuda_lib(cublas_static STATIC NAME libcublas_static.a) + add_cuda_lib(curand_static STATIC NAME libcurand_static.a) + add_cuda_lib(culibos_static STATIC NAME libculibos.a) + if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_lib(cublasLt_static STATIC NAME libcublasLt_static.a) + endif() + set_property(GLOBAL PROPERTY CUDA_MODULES cudnn_static ${CUDA_MODULES}) +else() + message(STATUS "Dynamic Link CUDA toolkit.") + add_cuda_lib(cudart SHARED NAME libcudart.so) + add_cuda_lib(cublas SHARED NAME libcublas.so) + add_cuda_lib(curand SHARED NAME libcurand.so) + if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_lib(cublasLt SHARED NAME libcublasLt.so) + endif() + set_property(GLOBAL PROPERTY CUDA_MODULES cudnn ${CUDA_MODULES}) +endif() # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index c0cb4ccea67cd493a30a6be43ee6ee48f70c36bf..d1386a6c7db08d140648106479a4e37947255c80 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -32,9 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - ) + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH) @@ -69,9 +69,15 @@ if(CUDNN_FOUND) file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) - add_library(cudnn_static STATIC IMPORTED GLOBAL) - set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION + if(LITE_WITH_STATIC_CUDA) + add_library(cudnn_static STATIC IMPORTED GLOBAL) + set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION "${CUDNN_LIB_PATH}/libcudnn_static.a") + else() + add_library(cudnn SHARED IMPORTED GLOBAL) + set_property(TARGET cudnn PROPERTY IMPORTED_LOCATION + "${CUDNN_LIB_PATH}/libcudnn.so") + endif(LITE_WITH_STATIC_CUDA) string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index bd0d117a633824d93c403b8167ff49505160069b..599e7bba7eaf12da7506ce44e706bd9f50ec6998 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) 
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -16,9 +17,12 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -29,12 +33,14 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" + DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 1d61154c0d45dea795902d6544deb796693db263..5166b494c489e25c970c7dbfe72fa1404302009f 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -20,6 +20,7 @@ endif() include(ExternalProject) +SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) @@ -38,8 +39,11 @@ ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" - GIT_REPOSITORY "https://github.com/herumi/xbyak.git" GIT_TAG "v5.661" # Jul 26th + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip + DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR} + DOWNLOAD_NAME "xbyak-5.66.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XBYAK_PREFIX_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 23b1e02108642df561948a6faa3152effb7ca932..fdc20351e8bcdf5fe8e95db3516f4c6f607611db 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") @@ -18,10 +19,12 @@ if(WIN32) ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NAME "xxHash-0.6.5.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 PATCH_COMMAND @@ -41,10 +44,12 @@ else() ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR 
${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" + DOWNLOAD_NAME "xxHash-0.6.5.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/lite.cmake b/cmake/lite.cmake index a095eea6d1cce9ba09ee631a50b8029e769f6d37..fd40fa437b52ff33089b55c6cfb7df6604a0530d 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -44,7 +44,7 @@ function (lite_deps TARGET) set(deps ${deps} ${var}) endforeach(var) if(LITE_WITH_CV) - foreach(var ${lite_cv_deps}) + foreach(var ${lite_deps_CV_DEPS}) set(deps ${deps} ${var}) endforeach(var) endif() @@ -94,6 +94,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_BM) + foreach(var ${lite_deps_BM_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -115,10 +121,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean # LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK # HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK # EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None +# CV_DEPS: LITE_WITH_CV function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -128,10 +135,12 @@ function(lite_cc_library TARGET) X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} - NPU_DEPS ${args_NPU_DEPS} - XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} ARM_DEPS ${args_ARM_DEPS} + CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -161,8 +170,8 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -173,9 +182,13 @@ function(lite_cc_binary TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} + CV_DEPS ${CV_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) @@ -205,8 +218,8 @@ function(lite_cc_test 
TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) ) @@ -225,9 +238,13 @@ function(lite_cc_test TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} + CV_DEPS ${args_CV_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -252,6 +269,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") +set(bm_kernels CACHE INTERNAL "bm kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -262,12 +280,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA) +# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -333,6 +351,12 @@ function(add_kernel TARGET device level) endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "BM") + if (NOT LITE_WITH_BM) + return() + endif() + set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) return() @@ -360,11 +384,13 @@ function(add_kernel TARGET device level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} - XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -383,7 +409,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -409,11 +435,13 @@ function(add_operator TARGET level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} - XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS 
${args_PROFILE_DEPS}
              LIGHT_DEPS ${args_LIGHT_DEPS}
              HVY_DEPS ${args_HVY_DEPS}
diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake
index 8d99343c3041351102820cb20890031fa3f5807e..2112f6b658f5f89b20d63c957cd0b979299c350b 100644
--- a/cmake/xpu.cmake
+++ b/cmake/xpu.cmake
@@ -99,7 +99,7 @@ else()
     set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0")
 set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
 set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..298ea9e213e8c4c11f0431077510d4e325733c65
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..66f9b291ba3b459a8d3a327f7a71d9bd2f7031e0
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1 @@
+Please refer to the [PaddleLite documentation style guide](http://agroup.baidu.com/paddle-infer/md/article/2561104).
diff --git a/docs/advanced_user_guides/add_layout.md b/docs/advanced_user_guides/add_layout.md
new file mode 100644
index 0000000000000000000000000000000000000000..11e504f93c2b1bcaefaa06c0a5f51aea0995884e
--- /dev/null
+++ b/docs/advanced_user_guides/add_layout.md
@@ -0,0 +1,184 @@
+# How to Add a Layout
+
+In Paddle-Lite, a `Place` combines Target, Layout, and Precision information and is used to register and select the concrete kernels of a model. Taking the new layouts `ImageDefault`, `ImageFolder`, and `ImageNW` as examples, this guide explains how to add a new layout to `Place`.
+
+Searching the code under `lite/core/` and `lite/api` with the keyword `NHWC` shows that the new layout has to be added to each of the following files:
+
+1. lite/api/paddle_place.h
+2. lite/api/paddle_place.cc
+3. lite/api/python/pybind/pybind.cc
+4. lite/core/op_registry.h
+5. lite/core/op_registry.cc
+
+## 1. lite/api/paddle_place.h
+
+Add the new layouts to `enum class DataLayoutType`. Note that the values of existing layouts must not change; new layouts simply take the next increasing values:
+
+```cpp
+enum class DataLayoutType : int {
+  kUnk = 0,
+  kNCHW = 1,
+  kNHWC = 3,
+  kImageDefault = 4,  // for opencl image2d
+  kImageFolder = 5,   // for opencl image2d
+  kImageNW = 6,       // for opencl image2d
+  kAny = 2,           // any data layout
+  NUM = 7,            // number of fields.
+};
+```
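+
+Before wiring the layout through the remaining files, it helps to see where it will eventually be used. The sketch below shows how a kernel registration could declare the new layout; the kernel class and alias are hypothetical placeholders, and the `REGISTER_LITE_KERNEL`/`LiteType::GetTensorTy` pattern follows the one used in the operator-adding guide in this same directory:
+
+```cpp
+// Hypothetical registration of an OpenCL layout-conversion kernel that
+// produces tensors in the new kImageDefault layout. The class name and the
+// alias ("nchw_to_image") are illustrative only.
+REGISTER_LITE_KERNEL(layout,
+                     kOpenCL,
+                     kAny,
+                     kImageDefault,  // the newly added layout
+                     paddle::lite::kernels::opencl::LayoutComputeNchwToImage,
+                     nchw_to_image)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                               PRECISION(kAny),
+                                               DATALAYOUT(kNCHW))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kImageDefault))})
+    .Finalize();
+```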
+## 2. lite/api/paddle_place.cc
+
+Three places in this file need changes. Note that the string names added in `DataLayoutToStr` must follow the order of the enum values in `lite/api/paddle_place.h`:
+
+```cpp
+// Change 1 in this file
+const std::string& DataLayoutToStr(DataLayoutType layout) {
+  static const std::string datalayout2string[] = {
+      "unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"};
+  auto x = static_cast<int>(layout);
+  CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
+  return datalayout2string[x];
+}
+
+// Change 2 in this file
+const std::string& DataLayoutRepr(DataLayoutType layout) {
+  static const std::string datalayout2string[] = {"kUnk",
+                                                  "kNCHW",
+                                                  "kAny",
+                                                  "kNHWC",
+                                                  "kImageDefault",
+                                                  "kImageFolder",
+                                                  "kImageNW"};
+  auto x = static_cast<int>(layout);
+  CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
+  return datalayout2string[x];
+}
+
+// Change 3 in this file
+std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
+  static const std::set<DataLayoutType> valid_set({DATALAYOUT(kNCHW),
+                                                   DATALAYOUT(kAny),
+                                                   DATALAYOUT(kNHWC),
+                                                   DATALAYOUT(kImageDefault),
+                                                   DATALAYOUT(kImageFolder),
+                                                   DATALAYOUT(kImageNW)});
+  if (layout == DATALAYOUT(kAny)) {
+    return valid_set;
+  }
+  return std::set<DataLayoutType>({layout});
+}
+```
+
+## 3. lite/api/python/pybind/pybind.cc
+
+Expose the new layouts in the Python bindings as well:
+
+```cpp
+  // DataLayoutType
+  py::enum_<DataLayoutType>(*m, "DataLayoutType")
+      .value("NCHW", DataLayoutType::kNCHW)
+      .value("NHWC", DataLayoutType::kNHWC)
+      .value("ImageDefault", DataLayoutType::kImageDefault)
+      .value("ImageFolder", DataLayoutType::kImageFolder)
+      .value("ImageNW", DataLayoutType::kImageNW)
+      .value("Any", DataLayoutType::kAny);
+```
+
+## 4. lite/core/op_registry.h
+
+In `class KernelRegistry final`, find `using any_kernel_registor_t =` and add the following entries (one per target/precision/layout combination; the combinations match the `INIT_FOR` entries added in the next section):
+
+```cpp
+// Add the following inside `using any_kernel_registor_t =`
+// in class KernelRegistry final:
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)> *,          //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)> *,          //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)> *,  //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)> *,   //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageNW)> *,       //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)> *, //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageFolder)> *,  //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageNW)> *,      //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)> *,   //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)> *,    //
+    KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageNW)> *,        //
+```
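+
+With the registry aware of the new layouts, a `Place` can now carry them. A minimal sketch, assuming a `CxxConfig config` as in the demo code of the pass guide:
+
+```cpp
+// Prefer OpenCL kernels with the new image layout, falling back to ARM CPU.
+std::vector<Place> valid_places{
+    Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
+    Place{TARGET(kARM), PRECISION(kFloat)},
+};
+config.set_valid_places(valid_places);
+```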
+## 5. lite/core/op_registry.cc
+
+Two places in this file need changes:
+
+```cpp
+// Change 1 in this file
+#define CREATE_KERNEL1(target__, precision__)                                \
+  switch (layout) {                                                          \
+    case DATALAYOUT(kNCHW):                                                  \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kNCHW)>(op_type);                             \
+    case DATALAYOUT(kAny):                                                   \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kAny)>(op_type);                              \
+    case DATALAYOUT(kNHWC):                                                  \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kNHWC)>(op_type);                             \
+    case DATALAYOUT(kImageDefault):                                          \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kImageDefault)>(op_type);                     \
+    case DATALAYOUT(kImageFolder):                                           \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kImageFolder)>(op_type);                      \
+    case DATALAYOUT(kImageNW):                                               \
+      return Create<TARGET(target__),                                        \
+                    PRECISION(precision__),                                  \
+                    DATALAYOUT(kImageNW)>(op_type);                          \
+    default:                                                                 \
+      LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
+  }
+
+// Change 2 in this file
+// Find the following constructor in the file:
+KernelRegistry::KernelRegistry()
+    : registries_(static_cast<int>(TARGET(NUM)) *
+                  static_cast<int>(PRECISION(NUM)) *
+                  static_cast<int>(DATALAYOUT(NUM)))
+
+// and add the following entries for the new layouts inside it:
+  INIT_FOR(kOpenCL, kFP16, kNCHW);
+  INIT_FOR(kOpenCL, kFP16, kNHWC);
+  INIT_FOR(kOpenCL, kFP16, kImageDefault);
+  INIT_FOR(kOpenCL, kFP16, kImageFolder);
+  INIT_FOR(kOpenCL, kFP16, kImageNW);
+  INIT_FOR(kOpenCL, kFloat, kImageDefault);
+  INIT_FOR(kOpenCL, kFloat, kImageFolder);
+  INIT_FOR(kOpenCL, kFloat, kImageNW);
+  INIT_FOR(kOpenCL, kAny, kImageDefault);
+  INIT_FOR(kOpenCL, kAny, kImageFolder);
+  INIT_FOR(kOpenCL, kAny, kImageNW);
+```
diff --git a/docs/advanced_user_guides/add_new_pass.md b/docs/advanced_user_guides/add_new_pass.md
new file mode 100644
index 0000000000000000000000000000000000000000..93b27cd038642c702cd213adffcc378dc852a1b3
--- /dev/null
+++ b/docs/advanced_user_guides/add_new_pass.md
@@ -0,0 +1,437 @@
+
+# How to Add a Pass
+
+This document introduces the `Pass` mechanism in `Lite` from three angles: **what a pass is**, **the pass implementation and interfaces**, and **the general registration workflow**. Finally, it uses `fc_fuse_pass` as an example to explain what a fusion pass does and how to register one.
+
+## Preliminaries: What Is a Pass?
+
+**After CxxPredictor loads a model, it optimizes the model before running inference. This model optimization process is implemented with passes.**
+The call chain is as follows:
+![图片](https://user-images.githubusercontent.com/45189361/69638690-20d21880-1096-11ea-8169-1d2c7e1a1609.png)
+
+ - `CreatePredictor(CxxConfig)` calls `Predictor->Build(CxxConfig)`.
+ - Building the CxxPredictor (Build) takes two steps:
+     - `Predictor->LoadModel()` loads the model file into a program.
+     - `Predictor->optimizer_.Run()` optimizes the original graph structure of the program.
+ - The graph optimization is carried out by calling `Pass->Apply(const std::unique_ptr<SSAGraph>& graph)`.
+
+**Each kind of pass defines one optimization**, such as kernel selection, op fusion, redundant-op removal, subgraph creation, memory optimization, type inference, or type conversion.
+
+## Pass Implementation and Interfaces: the Pass Base Class, PassManager, and Pass Registration
+
+### 1. The Pass base class: `paddle::lite::mir::Pass`
+```c++
+class Pass {
+ public:
+  // The kind of a pass. Passes fall into three kinds by what they do:
+  enum class Kind {
+    // 1. Passes that modify the graph topology of the model.
+    kProgramWise = 0,
+    // 2. Passes that keep the graph structure and only modify statement state.
+    kStmtWise,
+    // 3. Passes that do not modify the IR; used to collect information and for visualization.
+    kDebug,
+  };
+
+  // The main interface: Apply defines what the pass does when it runs.
+  virtual void Apply(const std::unique_ptr<SSAGraph>& graph) = 0;
+
+  bool is_program_pass() const { return kind_ == Kind::kProgramWise; }
+  bool is_stmt_pass() const { return kind_ == Kind::kStmtWise; }
+
+  virtual ~Pass() = default;
+
+ private:
+  const Kind kind_;   // the kind of the pass
+  std::string name_;  // the name of the pass
+  std::set<TargetType> bound_targets_;  // the hardware targets the pass runs on; the optimizer filters passes by whether the current target matches.
+  std::unordered_map<std::string, std::set<lite_api::PrecisionType>> bound_kernels_;  // the kernels bound to the pass
+};
+
+// Different kinds of passes derived from it:
+class ProgramPass : public Pass {
+ public:
+  ProgramPass() : Pass(Kind::kProgramWise) {}
+};
+
+class StmtPass : public Pass {
+ public:
+  StmtPass() : Pass(Kind::kStmtWise) {}
+};
+
+class DebugPass : public Pass {
+ public:
+  DebugPass() : Pass(Kind::kDebug) {}
+};
+```
+**Location**: `lite/core/mir/pass.h`
+**Key members**:
+ - `const Kind kind_`: the pass kind. There are three basic kinds: `ProgramPass` modifies the graph structure, `StmtPass` modifies statement state, and `DebugPass` collects information and controls visualization during debugging.
+ - `std::string name_`: the name of the pass.
+ - `std::set<TargetType> bound_targets_`: the hardware targets the pass runs on; during `optimizer.Run()` the matching passes are selected automatically according to the target platform.
+ - `std::unordered_map<std::string, std::set<lite_api::PrecisionType>> bound_kernels_`: the kernels bound to the pass.
+**Key interface**:
+ - `Pass::Apply(const std::unique_ptr<SSAGraph>& graph)`: the concrete operation the pass performs; this is the interface a new pass must implement. Its input is a pointer to an `SSAGraph`, a topological representation of the model structure.
+
+### 2. Pass management: `paddle::lite::mir::PassManager`
+
+```c++
+class PassManager {
+ public:
+  // A static singleton that stores the registered passes and drives graph optimization.
+  static PassManager& Global() {
+    static PassManager x;
+    return x;
+  }
+
+  // Run all passes on the graph.
+  void Run(const std::unique_ptr<SSAGraph>& graph) {
+    for (auto& pass : passes_) {
+      LOG(INFO) << "Running MIR pass " << pass->name();
+      pass->Apply(graph);
+    }
+  }
+
+ private:
+  std::list<std::unique_ptr<mir::Pass>> passes_;  // stores all passes
+  std::map<std::string, mir::Pass*> pass_map_;    // stores the name -> pass mapping
+};
+```
+**Location**: `lite/core/mir/pass_manager.h`
+**Key members**:
+ - `std::list<std::unique_ptr<mir::Pass>> passes_;`: a list holding all registered passes.
+ - `std::map<std::string, mir::Pass*> pass_map_;`: a map holding all "pass name -> pass object" pairs, used to look up passes by name.
+
+**Key interfaces**:
+ - `static PassManager& Global()`: returns the global static PassManager, which stores all registered passes.
+ - `bool AddNewPass(const std::string& name, Pass* pass)`: adds a new pass to the PassManager.
+
+### 3. Pass registration: `paddle::lite::mir::PassRegistry`
+**Location**: `lite/core/mir/pass_registry.h`
+**Key interface**:
+ - `REGISTER_MIR_PASS(name__, class__)`: a macro for registering a pass. Registration boils down to `PassManager::Global().AddNewPass(name__, class__)`, which adds the newly registered pass to the global `PassManager`.
+
+## General Pass Registration Workflow and Usage
+
+### 1. Registration workflow
+To register a new pass, inherit from the `Pass` base class under `lite/core/mir` (or one of its subdirectories), implement the `Pass::Apply` interface, and register the pass with the `PassManager` via the `REGISTER_MIR_PASS(name__, class__)` macro.
+
+**Taking a new `example_pass` as the example**, the steps are:
+(1) Create `example_pass.cc` and `example_pass.h` under `lite/core/mir`.
+(2) In `example_pass.h`, define your pass class by inheriting from one of the base classes (ProgramPass, StmtPass, or DebugPass):
+```c++
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+class ExamplePass : public ProgramPass {
+  void Apply(const std::unique_ptr<SSAGraph> &graph) override {}
+  ...
+};
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+```
+(3) In `example_pass.cc`, implement the `ExamplePass::Apply()` interface and register `ExamplePass`:
+```c++
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/example_pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+void ExamplePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  ...
+}
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+REGISTER_MIR_PASS(example_pass, paddle::lite::mir::ExamplePass)
+    .BindTargets({TARGET(kARM)});  // the hardware targets the pass runs on
+    // .BindKernel("conv2d");      // the kernel bound to the pass
+```
+
+(4) Modify `lite/core/mir/CMakeLists.txt` to compile `example_pass.cc` into the `mir_passes` library:
+
+```cmake
+lite_cc_library(mir_passes
+  SRCS
+      example_pass.cc  # the new pass file
+      ...
+      memory_optimize_pass.cc
+  DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
+```
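+
+To make the workflow concrete, here is a minimal sketch of what `ExamplePass::Apply` could do: walk the graph and log every statement's op type. It assumes the `SSAGraph::StmtTopologicalOrder()` and `Node::AsStmt()` APIs; treat it as an illustration, not part of the registration contract:
+
+```c++
+void ExamplePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  // Visit all statement (op) nodes in topological order and log them.
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    if (!node->IsStmt()) continue;
+    LOG(INFO) << "ExamplePass visits op: " << node->AsStmt().op_type();
+  }
+}
+```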
+### 2. Usage workflow
+
+Registering a pass with the PassManager does not make it take effect by itself; the pass only runs during model optimization if it is also added in `optimizer->Run()`.
+(1) Reference the pass in `paddle_use_passes.h`:
+
+```c++
+#include "paddle_lite_factory_helper.h"  // NOLINT
+    ...
+USE_MIR_PASS(example_pass);  // reference the new pass
+```
+(2) To invoke the pass when optimizing a model, add it manually in `optimizer->Run()`. Modify `lite/core/optimizer.h` and add `example_pass` to `Optimizer::Run()`:
+
+```c++
+class Optimizer {
+ public:
+  void Run(...) {
+    ...
+    if (passes.empty()) {
+      RunPasses(std::vector<std::string>{
+          {"example_pass",  // add the newly registered pass here
+           ...
+          }});
+    }
+    ...
+  }
+```
+(3) Only a CxxPredictor optimizes the model with the passes after loading it:
+```c++
+    ...
+#include "paddle_use_passes.h"  // pull in the passes used to optimize the model
+void RunModel() {
+  // 1. Create a CxxConfig
+  CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
+
+  // 2. Create a CxxPredictor. This loads the model and optimizes it with the passes.
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<CxxConfig>(config);
+}
+```
+
+## Defining and Registering a Fusion Pass
+
+A `Fusion Pass` is a common kind of graph-optimization pass. It fuses several consecutive ops into a single equivalent op, reducing data exchange and simplifying the graph. At run time the pass invokes a `Fuser` that automatically finds and replaces the specified graph structure, so registering a fusion pass also requires implementing the corresponding fuser class.
+
+The following uses `fc_fuse_pass` to explain in detail what a fusion pass does and how to register one.
+
+### What `fc_fuse_pass` does
+It fuses an adjacent `mul` op and `elementwise_add` op into a single `fc` op:
+```c++
+mul(X) = X * W
+elementwise_add(mul(X)) = X * W + Bias
+//----------> after fusion
+FC(X) = X * W + Bias
+```
+
+The effect of running the pass:
+![图片](https://user-images.githubusercontent.com/45189361/69639193-12383100-1097-11ea-9063-21f030414080.png)
+The original parameters of `mul` and `elementwise_add` map onto the parameters of `fc`:
+![图片](https://user-images.githubusercontent.com/45189361/69638836-74446680-1096-11ea-9cdc-a961fa995dfe.png)
+
+### How to register `fc_fuse_pass`
+#### 1. Create the FcFuser
+(1) Create `fc_fuser.cc` and `fc_fuser.h` under `lite/core/mir/fusion`.
+(2) In `fc_fuser.h`, define your fuser class by inheriting from `FuseBase`:
+
+```c++
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+class FcFuser : public FuseBase {
+ public:
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+};
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+```
+**Key interfaces**:
+ - `FuseBase::BuildPattern`: describes the graph structure (pattern) to replace; at run time the fuser automatically finds and replaces this pattern.
+ - `FuseBase::GenOpDesc`: creates the equivalent fused op.
+ - `FuseBase::InsertNewNode`: replaces the original graph structure (pattern) with the fused op.
+
+For `FcFuser`: BuildPattern describes the `mul + elementwise_add` pattern, GenOpDesc creates the fc op, and InsertNewNode replaces the `mul + elementwise_add` pattern in the model with the newly created fc op.
+
+(3) Implement the `BuildPattern()`, `GenOpDesc()`, and `InsertNewNode()` interfaces in `fc_fuser.cc`. The FcFuser implementations of the three interfaces are shown below:
+
+```c++
+// 1. BuildPattern describes the graph structure to replace.
+// FcFuser::BuildPattern() describes the mul + elementwise_add structure.
+void FcFuser::BuildPattern() {
+  // (1) Describe the ops and variables with OpNode and VarNode.
+  // the mul op
+  auto* mul = OpNode("mul", "mul");
+  // inputs and output of the mul op
+  auto* x = VarNode("x")->assert_is_op_input("mul", "X");
+  auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
+  auto* mul_out = VarNode("mul_out");
+
+  // the elementwise_add op
+  auto* add = OpNode("add", "elementwise_add");
+  // input of elementwise_add
+  auto* b = VarNode("b")->assert_is_persistable_var();
+  // output of elementwise_add (the final output)
+  auto* Out = VarNode("Out");
+
+  // (2) Describe the topology (how mul and elementwise_add are connected before the fusion).
+  std::vector<PMNode*> mul_inputs{W, x};
+  std::vector<PMNode*> add_inputs{mul_out, b};
+  mul_inputs >> *mul >> *mul_out;
+  add_inputs >> *add >> *Out;
+
+  // (3) Mark the nodes that will be removed from the new topology,
+  //     i.e. the fused ops and the intermediate variables between them.
+  mul_out->AsIntermediate();
+  mul->AsIntermediate();
+  add->AsIntermediate();
+}
+
+// 2. GenOpDesc creates the equivalent fused op.
+// FcFuser::GenOpDesc() creates the fc op.
+cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
+  // (1) Take the OpDesc of the first op node and clear its input/output info.
+  cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
+  op_desc.mutable_inputs()->clear();
+  op_desc.mutable_outputs()->clear();
+  // (2) Modify the OpDesc: set the op type to "fc" (the op type of the FC op).
+  op_desc.SetType("fc");
+  // (3) Set the Input, Output, and Attribute entries of the OpDesc,
+  //     connecting them to the VarNodes created in BuildPattern().
+  op_desc.SetInput("Input", {matched.at("x")->arg()->name});
+  op_desc.SetInput("W", {matched.at("W")->arg()->name});
+  op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
+  op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
+  op_desc.SetAttr(
+      "in_num_col_dims",
+      matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
+  return op_desc;
+}
+
+// 3. InsertNewNode replaces the original pattern in the model graph with the fused op.
+// FcFuser::InsertNewNode() replaces "mul + elementwise_add" with the fc op.
+void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
+  // (1) Create the parameters (OpDesc) of the fc op.
+  auto op_desc = GenOpDesc(matched);
+  // Create an fc op.
+  auto fc_op = LiteOpRegistry::Global().Create("fc");
+
+  // Find the scope and valid_places (supported device types) of the original topology.
+  auto mul = matched.at("mul")->stmt()->op();
+  auto* scope = mul->scope();
+  auto& valid_places = mul->valid_places();
+
+  // (2) Give the fc op the same scope and valid_places as before the fusion,
+  //     and create its node in the graph.
+  fc_op->Attach(op_desc, scope);
+  auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);
+
+  // (3) Link the fc node to its input and output var nodes.
+  IR_NODE_LINK_TO(matched.at("W"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("x"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("b"), new_op_node);
+  IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
+}
+```
+
+#### 2. Register fc_fuse_pass
+
+(1) Create `fc_fuse_pass.cc` and `fc_fuse_pass.h` under `lite/core/mir/fusion`.
+(2) In `fc_fuse_pass.h`, define `FcFusePass` by inheriting from `ProgramPass`:
+
+```c++
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class FcFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+```
+(3) In `fc_fuse_pass.cc`, implement the `FcFusePass::Apply()` interface and register `FcFusePass`:
+```c++
+#include "lite/core/mir/fusion/fc_fuse_pass.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  fusion::FcFuser fuser;
+  fuser(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
+    .BindTargets({TARGET(kAny)})  // FcFusePass can run on any hardware target
+    .BindKernel("fc");            // FcFusePass binds the fc kernel
+```
+
+(4) Modify `lite/core/mir/fusion/CMakeLists.txt` to compile `fc_fuser.cc` into the `mir_fusers` library:
+
+```cmake
+lite_cc_library(fuse_fc
+        SRCS fc_fuser.cc
+        DEPS pattern_matcher_high_api)
+
+set(mir_fusers
+    fuse_fc
+    ...
+    CACHE INTERNAL "fusers")
+```
+
+(5) Modify `lite/core/mir/CMakeLists.txt` to compile `fc_fuse_pass.cc` into the `mir_passes` library:
+```cmake
+lite_cc_library(mir_passes
+  SRCS
+      fusion/fc_fuse_pass.cc
+      ...
+  DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
+```
+
+#### 3. Use fc_fuse_pass
+
+(1) `lite/api/paddle_use_passes.h` uses the `USE_MIR_PASS` macro to reference the newly added pass:
+
+```c++
+USE_MIR_PASS(lite_fc_fuse_pass);
+```
+(2) Add the newly registered pass to the `Optimizer::Run()` function in `lite/core/optimizer.h`:
+```C++
+class Optimizer {
+ public:
+  void Run(Program&& program,
+           const std::vector<Place>& valid_places,
+           core::KernelPickFactor kernel_pick_factor,
+           const std::vector<std::string>& passes = {}) {
+    ...
+    if (passes.empty()) {
+      RunPasses(std::vector<std::string>{
+          {"lite_fc_fuse_pass",  // the newly registered pass
+           ...
+           "argument_type_display_pass"}});
+    } else {
+      RunPasses(passes);
+    }
+    exec_scope_ = program.exec_scope();
+  }
+```
+(3) With the changes above in place, when `CreatePredictor(CxxConfig)` builds a CxxPredictor, model optimization invokes `lite_fc_fuse_pass`, which scans for the `mul + elementwise_add` structure and replaces it with the equivalent fc op.
diff --git a/docs/advanced_user_guides/add_operation.md b/docs/advanced_user_guides/add_operation.md
new file mode 100644
index 0000000000000000000000000000000000000000..525832f8a9d7341c3124498084e05b160358b2ad
--- /dev/null
+++ b/docs/advanced_user_guides/add_operation.md
@@ -0,0 +1,189 @@
+# How to Add an Operator
+
+The following uses argmax as an example to explain in detail how to add a new op.
+
+## 1. Add an OpParam struct to carry the op's inputs and outputs
+
+- Here it is named `ArgmaxParam`.
+
+- Add the `ArgmaxParam` struct in `paddlelite/lite/operators/op_params.h`:
+  ```c++
+  struct ArgmaxParam {
+    lite::Tensor* X{};
+    lite::Tensor* Out{};
+    int Axis{0};
+  };
+  ```
+## 2. Add and register the Argmax op
+
+- Create argmax_op.h under the paddlelite/lite/operators/ directory. The main code is:
+  ```c++
+  class ArgmaxOpLite : public OpLite {
+   public:
+    ArgmaxOpLite() {}
+    explicit ArgmaxOpLite(const std::string &op_type) : OpLite(op_type) {}
+    bool CheckShape() const override;
+    bool InferShape() const override;
+    bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+    void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+    std::string DebugString() const override { return "argmax"; }
+   private:
+    mutable ArgmaxParam param_;
+  };
+  ```
+  `ArgmaxOpLite` inherits from `OpLite`; its members include an `ArgmaxParam` struct, and the interfaces to implement are `CheckShape()`, `InferShape()`, `AttachImpl()`, `AttachKernel()`, and `DebugString()`. `AttachKernel()` and `DebugString()` are simple and implemented inline here.
+
+- Create argmax_op.cc under `paddlelite/lite/operators/`, implementing `CheckShape()`, `InferShape()`, and `AttachImpl()`. `CheckShape()` checks that the inputs are valid, `InferShape()` infers the output dimensions from the inputs, and `AttachImpl()` binds the op's inputs and outputs. Then register argmax in argmax_op.cc. The core code:
+  ```c++
+  bool ArgmaxOpLite::CheckShape() const {
+    CHECK_OR_FALSE(param_.X);
+    CHECK_OR_FALSE(param_.Out);
+    CHECK_OR_FALSE(param_.Axis < (param_.X)->dims().size());
+    return true;
+  }
+
+  bool ArgmaxOpLite::InferShape() const {
+    auto x_dims = param_.X->dims();
+    int x_rank = x_dims.size();
+    int axis = param_.Axis;
+    if (axis < 0) axis += x_rank;
+
+    std::vector<int64_t> out_dims;
+    for (int64_t i = 0; i < axis; i++) {
+      out_dims.push_back(x_dims[i]);
+    }
+    for (int64_t i = axis + 1; i < x_rank; i++) {
+      out_dims.push_back(x_dims[i]);
+    }
+
+    // Set output dims
+    param_.Out->Resize(lite::DDim(out_dims));
+    return true;
+  }
+
+  bool ArgmaxOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+    auto x = op_desc.Input("X").front();
+    auto out = op_desc.Output("Out").front();
+
+    param_.X = scope->FindVar(x)->GetMutable<lite::Tensor>();
+    param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
+    param_.Axis = op_desc.GetAttr<int>("Axis");
+
+    return true;
+  }
+  REGISTER_LITE_OP(argmax, paddle::lite::operators::ArgmaxOpLite);
+  ```
+- In paddlelite/lite/operators/CMakeLists.txt, add ```add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS})```
+
+## 3. Add and bind the Argmax kernel
+
+The following uses the ARM implementation of argmax as an example.
+- Create argmax_compute.h under the paddlelite/lite/kernels/arm/ directory, declaring class ArgmaxCompute, which inherits from KernelLite<TARGET(kARM), PRECISION(kFloat)>:
+  ```c++
+  class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+   public:
+    using param_t = operators::ArgmaxParam;
+    void Run() override;
+    virtual ~ArgmaxCompute() = default;
+  };
+  ```
+- Create argmax_compute.cc under paddlelite/lite/kernels/arm/, mainly implementing the Run function. `Run()` calls `argmax_func()` from paddlelite/lite/backends/arm/math/argmax.h to compute the output from the input. Finally, in argmax_compute.cc, bind argmax's inputs and outputs (every input parameter that is a tensor must be bound):
+  ```c++
+  void ArgmaxCompute::Run() {
+    auto& param = Param<operators::ArgmaxParam>();
+    lite::Tensor* input = param.X;
+    lite::Tensor* output = param.Out;
+    int axis = param.Axis;
+    lite::arm::math::argmax_func(input, axis, output);
+    return;
+  }
+
+  REGISTER_LITE_KERNEL(
+      argmax, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ArgmaxCompute, def)
+      .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+      .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+      .Finalize();
+  ```
+
+- In paddlelite/lite/kernels/arm/CMakeLists.txt, add
+  ```cmake
+  add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
+  ```
+
+## 4. Add the Argmax implementation
+
+- Create argmax.h under the paddlelite/lite/backends/arm/math/ directory, declaring `argmax_func()`:
+  ```c++
+  void argmax_func(const lite::Tensor* input, const int axis, lite::Tensor* output);
+  ```
+- Create argmax.cc under the same directory, implementing `argmax_func()`:
+  ```c++
+  void argmax_func(const lite::Tensor *input,
+                   const int axis,
+                   lite::Tensor *output) {
+    auto input_ddim = input->dims();
+    auto output_ddim = output->dims();
+
+    const int size = input_ddim[axis];
+    const int in_channel = input_ddim.count(axis, input_ddim.size());
+    const int out_channel = output_ddim.count(axis, output_ddim.size());
+    const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
+    const int out_stride = input_ddim.count(0, axis);
+
+    for (int n = 0; n < out_stride; n++) {
+      for (int k = 0; k < in_stride; k++) {
+        const float *in_ptr = input->data<float>() + n * in_channel + k;
+        std::vector<std::pair<float, int>> vec;
+        vec.resize(size);
+        for (int i = 0; i < size; i++) {
+          vec[i] = std::make_pair(in_ptr[i * in_stride], i);
+        }
+        // sort
+        std::partial_sort(vec.begin(),
+                          vec.begin() + 1,
+                          vec.end(),
+                          std::greater<std::pair<float, int>>());
+
+        // out
+        float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
+        *out_ptr = vec[0].second;
+      }
+    }
+  }
+  ```
+- Add argmax.cc to the ```math_arm``` library in paddlelite/lite/backends/arm/math/CMakeLists.txt, and add ```#include "lite/backends/arm/math/argmax.h"``` to paddlelite/lite/backends/arm/math/funcs.h.
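+
+The pair-plus-`std::partial_sort` idiom used by `argmax_func` is easy to check in isolation. A standalone sketch in plain C++, independent of Lite:
+
+```c++
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+int main() {
+  std::vector<float> in{0.3f, 2.5f, 1.7f, 0.9f};
+  // Pair each value with its index, as argmax_func does for every slice.
+  std::vector<std::pair<float, int>> vec;
+  for (int i = 0; i < static_cast<int>(in.size()); ++i) {
+    vec.emplace_back(in[i], i);
+  }
+  // Move only the largest pair to the front; cheaper than a full sort.
+  std::partial_sort(vec.begin(), vec.begin() + 1, vec.end(),
+                    std::greater<std::pair<float, int>>());
+  std::cout << "argmax index: " << vec[0].second << std::endl;  // prints 1
+  return 0;
+}
+```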
+
+## 5. Add the Argmax unit test
+
+- Create argmax_compute_test.cc under the paddlelite/lite/tests/kernels directory, declaring and implementing class ArgmaxComputeTester.
+- ArgmaxComputeTester mainly contains the PrepareOpDesc, PrepareData, and RunBaseline functions. PrepareOpDesc sets the op type and the input/output parameters of the test, PrepareData initializes the input tensors, and RunBaseline computes the reference output from the inputs, which is compared against the output computed by the framework.
+- Add the test with gtest:
+  ```c++
+  TEST(Argmax, precision) {
+  #ifdef LITE_WITH_ARM
+    LOG(INFO) << "test argmax arm";
+    Place place(TARGET(kARM));
+
+    for (int axis : {0, 1, 2, 3}) {
+      for (int n : {1, 3}) {
+        for (int c : {3, 6}) {
+          for (int h : {9, 18}) {
+            for (int w : {9, 18}) {
+              std::unique_ptr<arena::TestCase> tester(
+                  new ArgmaxComputeTester(place, "def", axis, n, c, h, w));
+              arena::Arena arena(std::move(tester), place, 2e-5);
+              arena.TestPrecision();
+            }
+          }
+        }
+      }
+    }
+  #endif
+  }
+  ```
+- In paddlelite/lite/tests/kernels/CMakeLists.txt, add
+  ```cmake
+  lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  ```
+## 6. Build and run
+- In the paddlelite directory, run ```./lite/tools/ci_build.sh build_test_arm```. The script creates a phone emulator and builds and runs all unit tests (this takes quite a while). If everything runs without errors, argmax has been added successfully.
diff --git a/docs/advanced_user_guides/index.rst b/docs/advanced_user_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/advanced_user_guides/model_quantization.md b/docs/advanced_user_guides/model_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d781ba9904400c26b64aed5f5dc764ecc5b24fa
--- /dev/null
+++ b/docs/advanced_user_guides/model_quantization.md
@@ -0,0 +1,327 @@
+# Model Quantization
+
+This document explains how to load quantized models produced by PaddlePaddle with Paddle-Lite and run inference on them. Using MobileNetV1 as the example, it first covers preparing a quantized model and then deploying and running it.
+
+## Preparing a Quantized Model
+
+PaddlePaddle quantizes an FP32 model to an INT8 model with one of two methods, quantization-aware training or post-training quantization. The following sections describe how each method produces a quantized model.
+
+### Quantization-aware training
+
+Currently, quantization-aware training in the PaddlePaddle framework mainly targets convolution layers (both 2-D and depthwise convolutions) and fully connected layers, corresponding to the conv2d, depthwise_conv2d, and mul operators; see the [documentation](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D) for more on how it works. Paddle-Lite can run models produced by PaddlePaddle's quantization-aware training, which further speeds up model execution on mobile devices.
+
+Tip: if you are new to the PaddlePaddle framework, start with the [beginner's guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html) and the [user guides](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html).
+
+You can either download a pre-trained quantized model or train one yourself with the PaddleSlim model-compression tool.
+
+#### Download a quantized model
+
+An official [quantized MobileNetV1 model](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip) is available for direct download:
+
+```bash
+wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip
+```
+
+#### Train a quantized model with the PaddleSlim model-compression tool
+
+##### Install PaddlePaddle
+
+Install PaddlePaddle following the [official instructions](https://paddlepaddle.org.cn/start) for your operating system, installation method, Python version, and CUDA version. For example:
+
+On Ubuntu 16.04.4 LTS with CUDA 9 and cuDNN 7, GPU build:
+```bash
+pip install paddlepaddle-gpu==1.6.0.post97 -i https://mirrors.aliyun.com/pypi/simple/
+```
+
+On Ubuntu 16.04.4 LTS, CPU build:
+```bash
+pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/
+```
+
+##### Clone the repository needed for quantization-aware training
+
+Clone [PaddlePaddle/models](https://github.com/PaddlePaddle/models) and enter the models/PaddleSlim directory:
+
+```bash
+git clone https://github.com/PaddlePaddle/models.git
+cd models/PaddleSlim
+```
+
+##### Data preparation
+###### Training data
+
+Following the data-preparation tutorial in [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation), download the training data and save it under PaddleSlim/data.
+
+###### Pretrained model
+
+Referring to the /models/PaddleSlim/run.sh script,
+download the MobileNetV1 pretrained model from [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances) and save it under PaddleSlim/pretrain.
+
+After these three steps, the directory structure under PaddleSlim looks like this:
+
+```bash
+.
+├── compress.py          # main script for compression tasks; defines the model-side information a task needs
+├── configs              # task configuration files: distillation, int8 quantization, filter pruning, and combined strategies
+├── data                 # training data (created by the user)
+│   └── ILSVRC2012
+├── pretrain             # pretrained model parameters; generated by running run.sh
+│   ├── MobileNetV1_pretrained
+│   ├── MobileNetV1_pretrained.tar
+│   ├── ResNet50_pretrained
+│   └── ResNet50_pretrained.tar
+├── docs                 # documentation
+├── light_nas
+├── models               # model network definitions, e.g. MobileNetV1
+├── quant_low_level_api  # low-level quantization-training APIs for fully customizing the process; for advanced users
+├── reader.py            # data-processing logic
+├── README.md
+├── run.sh               # launcher script for compression tasks
+└── utility.py           # common utility functions
+```
+
+##### The compression script
+
+`compress.py` defines all the model-related information a compression task needs. The key steps are briefly described below.
+
+###### Defining the target network
+
+The following snippet from compress.py defines the train program; here the train program contains only forward computation:
+```python
+out = model.net(input=image, class_dim=args.class_dim)
+cost = fluid.layers.cross_entropy(input=out, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+```
+
+Then eval_program is obtained with the clone method and used to evaluate model accuracy during compression:
+
+```python
+val_program = fluid.default_main_program().clone()
+```
+
+After defining the target network, initialize it and, if needed, load the pretrained model.
+
+###### Defining feed_list and fetch_list
+For the train program, train_feed_list specifies which variables the data taken from the train data reader is fed to, and train_fetch_list specifies the results to show in the log during training. To print accuracy information in the log during training, just add ('acc_top1', acc_top1.name) to train_fetch_list:
+```python
+train_feed_list = [('image', image.name), ('label', label.name)]
+train_fetch_list = [('loss', avg_cost.name)]
+```
+
+> Note: train_fetch_list must contain the loss entry.
+
+For the eval program, define val_feed_list and val_fetch_list in the same way:
+
+```python
+val_feed_list = [('image', image.name), ('label', label.name)]
+val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)]
+```
+
+###### The Compressor and the quantization configuration file
+`compress.py` mainly uses the Compressor class together with a YAML file to carry out quantization-aware training. The Compressor class is defined as:
+```python
+class Compressor(object):
+    def __init__(self,
+                 place,
+                 scope,
+                 train_program,
+                 train_reader=None,
+                 train_feed_list=None,
+                 train_fetch_list=None,
+                 eval_program=None,
+                 eval_reader=None,
+                 eval_feed_list=None,
+                 eval_fetch_list=None,
+                 teacher_programs=[],
+                 checkpoint_path='./checkpoints',
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+```
+
+When constructing a Compressor object, pay attention to the following:
+* If the train program already contains backward operators and optimizer-update operators, the train_optimizer argument must be set to None.
+* Parameter names in eval_program must exactly match those in train_program.
+* The quantized model that is finally saved is pruned from the eval_program network. So if you want the saved model to be usable for inference, the eval program must contain all the operators needed at inference time.
+* The saved checkpoints contain a float-typed model.
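+
+For orientation, this is roughly how compress.py wires these pieces together. The sketch is based on the class signature above and the configuration file below; variable names follow the earlier snippets, and the exact call sequence may differ between PaddleSlim versions:
+
+```python
+# Sketch: assemble and run the Compressor with the objects defined above.
+com_pass = Compressor(
+    place,
+    fluid.global_scope(),
+    fluid.default_main_program(),     # forward-only train program
+    train_reader=train_reader,
+    train_feed_list=train_feed_list,
+    train_fetch_list=train_fetch_list,
+    eval_program=val_program,
+    eval_reader=val_reader,
+    eval_feed_list=val_feed_list,
+    eval_fetch_list=val_fetch_list,
+    train_optimizer=optimizer)        # set to None if the program already has backward/update ops
+com_pass.config('./configs/quantization.yaml')  # load the quantization strategy
+com_pass.run()                                  # run the compression task
+```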
+
+An example quantization configuration file, `configs/quantization.yaml`:
+
+```yaml
+version: 1.0
+strategies:
+    quantization_strategy:
+        class: 'QuantizationStrategy'
+        start_epoch: 0
+        end_epoch: 9
+        float_model_save_path: './output/float'
+        mobile_model_save_path: './output/mobile'
+        int8_model_save_path: './output/int8'
+        weight_bits: 8
+        activation_bits: 8
+        weight_quantize_type: 'abs_max'
+        activation_quantize_type: 'moving_average_abs_max'
+        save_in_nodes: ['image']
+        save_out_nodes: ['fc_0.tmp_2']
+compressor:
+    epoch: 10
+    checkpoint_path: './checkpoints_quan/'
+    strategies:
+        - quantization_strategy
+```
+The configurable parameters are:
+- **class:** the class name of the quantization strategy; currently only `QuantizationStrategy` is supported.
+- **start_epoch:** before start_epoch begins, the strategy inserts quantization and dequantization operators into train_program and eval_program; quantization-aware training starts at start_epoch.
+- **end_epoch:** after end_epoch finishes, models are saved in the formats the user specified. Note that quantization training does not stop after end_epoch; it continues until the epoch count reaches the compressor.epoch value. For example, with start_epoch=0, end_epoch=0, and compressor.epoch=2, quantization training starts at epoch 0 and ends at epoch 1, but the saved model holds the parameter state at the end of epoch 0.
+- **float_model_save_path:** the path for saving the model in float format, i.e. the parameters lie within the int8 range but their data type is float32. If set to None, no float-format model is saved; the default is None. **Note: Paddle-Lite uses the model in this directory for quantized-model inference optimization; see the [Running Quantized Model Inference with Paddle-Lite](#二使用Paddle-Lite运行量化模型推理) section of this document.**
+- **int8_model_save_path:** the path for saving the model in int8 format, i.e. the parameters lie within the int8 range and their data type is int8. If set to None, no int8-format model is saved; the default is None.
+- **mobile_model_save_path:** the path for saving a model compatible with the paddle-mobile framework. If set to None, no paddle-mobile-format model is saved; the default is None. (paddle-mobile has since been upgraded to Paddle-Lite.)
+- **weight_bits:** the bit width for quantizing weights; note that bias parameters are not quantized.
+- **activation_bits:** the bit width for quantizing activations.
+- **weight_quantize_type:** the weight quantization method; quantization-aware training currently supports `abs_max` and `channel_wise_abs_max`.
+- **activation_quantize_type:** the activation quantization method; quantization-aware training currently supports `range_abs_max` and `moving_average_abs_max`. PaddlePaddle also supports the `abs_max` method for activations, but it computes the input's quantization scale dynamically, which adds computation and slows model inference, so Lite does not support `abs_max` activation quantization.
+- **save_in_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by forward traversal according to save_in_nodes. Defaults to the variable names given in eval_feed_list.
+- **save_out_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by backward traversal according to save_out_nodes. Defaults to the variable names given in eval_fetch_list.
+
+> **Remarks:**
+>
+> 1) `abs_max` computes the quantization scale dynamically at every training step and at inference time. `channel_wise_abs_max` is similar, except that it computes a per-channel quantization scale for convolution weights. In other words, `abs_max` is tensor-wise quantization while `channel_wise_abs_max` is channel-wise quantization; see [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/quantization/training_quantization_model_format.md) for details.
+>
+> 2) `moving_average_abs_max` and `range_abs_max` compute a static quantization scale during training and use it at inference time. `moving_average_abs_max` computes the scale with a sliding-window moving average, while `range_abs_max` uses the window's absolute maximum.
+>
+> 3) **Currently, Paddle-Lite only supports running quantized models produced with `abs_max` weight quantization and `moving_average_abs_max` or `range_abs_max` activation quantization.**
+
+##### Run int8 quantization-aware training
+
+Modify run.sh: comment out the content between `# enable GC strategy` and `# for sensitivity filter pruning`, and uncomment the `# for quantization` commands (the commands to uncomment are shown below):
+
+```bash
+# for quantization
+#---------------------------
+export CUDA_VISIBLE_DEVICES=0
+python compress.py \
+--batch_size 64 \
+--model "MobileNet" \
+--pretrained_model ./pretrain/MobileNetV1_pretrained \
+--compress_config ./configs/quantization.yaml \
+--quant_only True
+```
+Finally, run `sh run.sh` to start int8 quantization-aware training.
+
+After the training finishes, if the model output paths were configured as in the `configs/quantization.yaml` above, the models/PaddleSlim/output directory contains three subdirectories, `float`, `int8`, and `mobile`:
+* float: a quantized model whose parameters lie within the int8 range but are stored as float32. Paddle-Lite uses the model files and parameters in this directory to deploy the quantized model.
+* int8: a quantized model whose parameters lie within the int8 range and are stored as int8.
+* mobile: a quantized model with the same parameter characteristics as the int8 directory that is compatible with paddle-mobile (which has since been upgraded to Paddle-Lite).
+
+### Post-training quantization
+
+
+##### 执行int8量化训练
+
+修改run.sh,即注释掉`# enable GC strategy`与`# for sensitivity filter pruning`之间的内容,并打开`#for quantization`相关的脚本命令(所需打开注释的命令如下所示)。
+
+```bash
+# for quantization
+#---------------------------
+export CUDA_VISIBLE_DEVICES=0
+python compress.py \
+--batch_size 64 \
+--model "MobileNet" \
+--pretrained_model ./pretrain/MobileNetV1_pretrained \
+--compress_config ./configs/quantization.yaml \
+--quant_only True
+```
+最后,运行`sh run.sh`命令开始int8量化训练。
+
+上述量化训练过程完成后,若按照本文中所述`configs/quantization.yaml`文件内容配置的模型输出路径,则可在models/PaddleSlim/output目录下看到`float`、`int8`和`mobile`三个目录,其中:
+* float目录: 参数范围为int8范围但参数数据类型为float32的量化模型。Paddle-Lite即使用该目录下的模型文件及参数进行量化模型的部署。
+* int8目录: 参数范围为int8范围且参数数据类型为int8的量化模型。
+* mobile目录:参数特点与int8目录相同且兼容paddle-mobile的量化模型(目前paddle-mobile已升级为Paddle-Lite)。
+
+### 训练后量化
+
+下面以MobileNetV1为例,介绍使用训练后量化方法产出量化模型。关于训练后量化的原理和详细使用方法,请参考[文档](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)。
+
+> 该示例的代码放在[models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)目录下。如果需要执行该示例,首先clone下来[models](https://github.com/PaddlePaddle/models.git),并安装具有训练后量化功能的PaddlePaddle。因为目前Lite仅支持对conv2d、depthwise_conv2d和mul量化,所以修改[run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh)脚本,设置is_full_quantize=False,然后执行该脚本;执行结束后,量化模型保存在`mobilenetv1_int8_model`目录下。下面介绍详细步骤。
+
+1)**准备模型和校准数据**
+
+安装PaddlePaddle的develop分支编译的whl包,准备已经训练好的FP32预测模型。
+
+准备校准数据,文件结构如下。val文件夹中有100张图片,val_list.txt文件中包含图片的label。
+```bash
+samples_100
+└──val
+└──val_list.txt
+```
+
+2)**配置校准数据生成器**
+
+MobileNetV1的输入是图片和标签,所以配置读取校准数据的sample_generator,每次返回一张图片和一个标签。详细代码在[models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py)。
+
+3)**调用训练后量化**
+
+调用训练后量化的核心代码如下,详细代码在[post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py)。
+```python
+place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace()
+exe = fluid.Executor(place)
+sample_generator = reader.val(data_dir=args.data_path)
+
+ptq = PostTrainingQuantization(
+    executor=exe,
+    sample_generator=sample_generator,
+    model_dir=args.model_dir,
+    model_filename=args.model_filename,
+    params_filename=args.params_filename,
+    batch_size=args.batch_size,
+    batch_nums=args.batch_nums,
+    algo=args.algo,
+    is_full_quantize=args.is_full_quantize == "True")
+quantized_program = ptq.quantize()
+ptq.save_quantized_model(args.save_model_path)
+```
+
+## 使用Paddle-Lite运行量化模型推理
+
+#### 使用模型优化工具对量化模型进行优化
+
+接下来,使用原始的量化模型生成适合在移动端直接部署的模型。
+
+参考[源码编译](../source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](../model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。
+```bash
+./model_optimize_tool \
+--model_file=mobilenet_v1_quant/float/model \
+--param_file=mobilenet_v1_quant/float/weights \
+--optimize_out_type=naive_buffer \
+--optimize_out=mobilenet_v1_quant_opt \
+--valid_targets=arm \
+--prefer_int8_kernel=true
+```
+
+如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,因此尚未起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后,对应的量化参数均会以int8类型重新存储,达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。
+
+#### 在手机端准备量化模型文件
+
+使用如下命令将mobilenet_v1_quant_opt目录下的量化模型文件导入到手机端:
+
+```bash
+adb push mobilenet_v1_quant_opt /data/local/tmp
+```
+
+#### 使用mobilenetv1\_light\_api运行优化后的量化模型
+
+参考[源码编译](../source_compile)配置编译环境后,在Paddle-Lite中执行如下命令编译轻量级API的demo:
+
+```bash
+cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light
+make clean && make -j
+```
+执行完上述命令后,可在`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/`路径下看到`mobilenetv1_light_api`可执行文件。将`mobilenetv1_light_api`导入到手机端并运行量化模型推理。执行命令如下:
+
+```bash
+adb push Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp
+adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb shell /data/local/tmp/mobilenetv1_light_api \
+    --model_dir=/data/local/tmp/mobilenet_v1_quant_opt
+```
+**程序运行结果如下:**
+```bash
+Output dim: 1000
+Output[0]: 0.000228
+Output[100]: 0.000260
+Output[200]: 0.000250
+Output[300]: 0.000560
+Output[400]: 0.000950 +Output[500]: 0.000275 +Output[600]: 0.005143 +Output[700]: 0.002509 +Output[800]: 0.000538 +Output[900]: 0.000969 +``` +在C++中使用Paddle-Lite API的方法请猛戳[此处](../cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 + +### FAQ + +**问题**:Compiled with WITH_GPU, but no GPU found in runtime + +**解答**:检查本机是否支持GPU训练,如果不支持请使用CPU训练。如果在docker进行GPU训练,请使用nvidia_docker启动容器。 + +**问题**:Inufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262] + +**解答**:正确设置run.sh脚本中`CUDA_VISIBLE_DEVICES`,确保显卡剩余内存大于需要内存。 diff --git a/docs/advanced_user_guides/support_operation_list.md b/docs/advanced_user_guides/support_operation_list.md new file mode 100644 index 0000000000000000000000000000000000000000..7c2ceb0ff819f7f1676308a33ec88f5eab820e57 --- /dev/null +++ b/docs/advanced_user_guides/support_operation_list.md @@ -0,0 +1,392 @@ +# 支持OP列表 + +## Ops + +- affine_channel +- anchor_generator +- arg_max +- assign +- assign_value +- attention_padding_mask +- axpy +- batch_norm +- beam_search +- beam_search_decode +- bilinear_interp +- box_clip +- box_coder +- calib +- calib_once +- cast +- collect_fpn_proposals +- concat +- conditional_block +- conv2d +- conv2d_transpose +- crop +- decode_bboxes +- density_prior_box +- depthwise_conv2d +- distribute_fpn_proposals +- dropout +- elementwise_add +- elementwise_div +- elementwise_max +- elementwise_mul +- elementwise_sub +- equal +- exp +- expand +- fake_channel_wise_dequantize_max_abs +- fake_dequantize_max_abs +- fake_quantize_dequantize_moving_average_abs_max +- fake_quantize_moving_average_abs_max +- fake_quantize_range_abs_max +- fc +- feed +- fetch +- fill_constant +- fill_constant_batch_size_like +- flatten +- flatten2 +- floor +- fusion_elementwise_add_activation +- fusion_elementwise_div_activation +- fusion_elementwise_max_activation +- fusion_elementwise_mul_activation +- fusion_elementwise_sub_activation +- gather +- generate_proposals +- graph_op +- greater_equal +- greater_than +- gru +- gru_unit +- hard_sigmoid +- im2sequence +- increment +- instance_norm +- io_copy +- io_copy_once +- is_empty +- layer_norm +- layout +- layout_once +- leaky_relu +- less_equal +- less_than +- lod_reset +- log +- logical_and +- logical_not +- logical_or +- logical_xor +- lookup_table +- lookup_table_v2 +- lrn +- match_matrix_tensor +- matmul +- mean +- merge_lod_tensor +- mul +- multiclass_nms +- nearest_interp +- negative +- norm +- notequal +- pad2d +- pool2d +- power +- prelu +- prior_box +- range +- read_from_array +- reduce_max +- reduce_mean +- reduce_prod +- reduce_sum +- relu +- relu6 +- relu_clipped +- reshape +- reshape2 +- roi_align +- rsqrt +- scale +- search_aligned_mat_mul +- search_attention_padding_mask +- search_fc +- search_grnn +- search_group_padding +- search_seq_arithmetic +- search_seq_depadding +- search_seq_fc +- search_seq_softmax +- sequence_arithmetic +- sequence_concat +- sequence_expand +- sequence_expand_as +- sequence_pool +- sequence_reshape +- sequence_reverse +- sequence_softmax +- sequence_topk_avg_pooling +- shape +- shuffle_channel +- sigmoid +- slice +- softmax +- softsign +- split +- split_lod_tensor +- sqrt +- square +- squeeze +- squeeze2 +- stack +- swish +- tanh +- top_k +- transpose +- transpose2 +- uniform_random +- unsqueeze +- unsqueeze2 +- var_conv_2d +- while +- write_to_array +- yolo_box + +## Kernels + +### Host kernels + +- feed +- fetch +- flatten +- flatten2 +- 
multiclass_nms +- reshape +- reshape2 + +### ARM kernels + +- affine_channel +- anchor_generator +- arg_max +- assign +- assign_value +- axpy +- batch_norm +- beam_search +- beam_search_decode +- bilinear_interp +- box_clip +- box_coder +- cast +- collect_fpn_proposals +- concat +- conditional_block +- conv2d +- conv2d_transpose +- crop +- decode_bboxes +- density_prior_box +- depthwise_conv2d +- distribute_fpn_proposals +- dropout +- elementwise_add +- elementwise_div +- elementwise_max +- elementwise_mul +- elementwise_sub +- equal +- exp +- expand +- fc +- fill_constant +- fill_constant_batch_size_like +- floor +- fusion_elementwise_add_activation +- fusion_elementwise_div_activation +- fusion_elementwise_max_activation +- fusion_elementwise_mul_activation +- fusion_elementwise_sub_activation +- gather +- generate_proposals +- greater_equal +- greater_than +- gru +- gru_unit +- hard_sigmoid +- im2sequence +- increment +- instance_norm +- is_empty +- layer_norm +- layout +- layout_once +- leaky_relu +- less_equal +- less_than +- lod_reset +- log +- logical_and +- logical_not +- logical_or +- logical_xor +- lookup_table +- lookup_table_v2 +- lrn +- matmul +- merge_lod_tensor +- mul +- nearest_interp +- negative +- norm +- not_equal +- pad2d +- pool2d +- power +- prelu +- prior_box +- range +- read_from_array +- reduce_max +- reduce_mean +- reduce_prod +- relu +- relu6 +- relu_clipped +- roi_align +- rsqrt +- scale +- sequence_expand +- sequence_pool +- sequence_softmax +- shape +- shuffle_channel +- sigmoid +- slice +- softmax +- split +- split_lod_tensor +- squeeze +- squeeze2 +- stack +- swish +- tanh +- top_k +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- while +- write_to_array +- yolo_box + + +### X86 kernels +- batch_norm +- cast +- concat +- conv2d +- depthwise_conv2d +- dropout +- elementwise_add +- elementwise_sub +- fc +- fill_constant_batch_size_like +- gather +- gelu +- gru +- layer_norm +- match_matrix_tensor +- matmul +- mul +- pool2d +- reduce_sum +- relu +- reshape +- reshape2 +- scale +- search_aligned_mat_mul +- search_attention_padding_mask +- search_fc +- search_grnn +- search_group_padding +- search_seq_arithmetic +- search_seq_depadding +- search_seq_fc +- search_seq_softmax +- sequence_arithmetic +- sequence_concat +- sequence_expand_as +- sequence_pool +- sequence_reverse +- sequence_topk_avg_pooling +- shape +- slice +- softmax +- softsign +- square +- squeeze +- squeeze2 +- stack +- tanh +- transpose +- transpose2 +- var_conv_2d + +### CUDA kernels +- attention_padding_mask +- bilinear_interp +- calib +- concat +- conv +- dropout +- elementwise_add +- fusion_elementwise_add_activation +- fusion_elementwise_mul_activation +- elementwise_mul +- feed +- io_copy +- layout +- layout_once +- leaky_relu +- lookup_table +- match_matrix_tensor +- mul +- nearest_interp +- pool2d +- relu +- scale +- search_aligned_mat_mul +- search_fc +- search_grnn +- search_group_padding +- search_seq_depadding +- search_seq_fc +- sequence_arithmetic +- sequence_concat +- sequence_pool +- sequence_reverse +- sequence_topk_avg_pooling +- softmax +- transpose +- var_conv_2d +- yolo_box + +### OpenCL kernels +- conv2d +- depthwise_conv2d +- elementwise_add +- fc +- fusion_elementwise_add_activation +- layout +- layout_once +- io_copy +- io_copy_once +- mul +- pool2d +- relu diff --git a/docs/advanced_user_guides/x86.md b/docs/advanced_user_guides/x86.md new file mode 100644 index 0000000000000000000000000000000000000000..7cb08683440312b0349662699b05e99df0cb6df1 --- /dev/null +++ 
b/docs/advanced_user_guides/x86.md
@@ -0,0 +1,104 @@
+# 使用X86预测库
+
+Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。
+
+(注意:非docker Linux环境需要是Ubuntu16.04)
+
+## 编译
+
+1、 下载代码
+```bash
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+# 需要切换到 release/v2.0.0 之后的版本
+git checkout <release-version-tag>
+```
+
+2、 源码编译
+
+```bash
+cd Paddle-Lite
+./lite/tools/build.sh x86
+```
+
+## 编译结果说明
+
+x86编译结果位于 `build.lite.x86/inference_lite_lib`
+**具体内容**说明:
+
+1、 `bin`文件夹:可执行工具文件 `test_model_bin`
+
+2、 `cxx`文件夹:包含c++的库文件与相应的头文件
+
+- `include` : 头文件
+- `lib` : 库文件
+  - 打包的静态库文件:
+    - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库
+    - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库
+  - 打包的动态库文件:
+    - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库
+    - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库
+
+3、 `third_party` 文件夹:第三方库文件
+
+## x86预测API使用示例
+
+```c++
+#include <gflags/gflags.h>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
+DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set CxxConfig
+  CxxConfig config;
+  config.set_model_file(FLAGS_model_dir + "model");
+  config.set_param_file(FLAGS_model_dir + "params");
+  config.set_valid_places({
+      lite_api::Place{TARGET(kX86), PRECISION(kFloat)}
+  });
+
+  // 2. Create PaddlePredictor by CxxConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<CxxConfig>(config);
+
+  // 3. Prepare input data
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 3, 224, 224}));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl;
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+    std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+              << std::endl;
+  }
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
+```
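+
+如果编译时开启了Python绑定(类似后文CUDA部分使用的`--build_python=ON`选项),也可以用Python接口完成同样的预测流程。下面是一段示意代码:其中`lite_core`的库路径以及`get_input`、`set_float_data`等接口名均为假设,具体请以实际编译产出的Python库为准:
+
+```python
+# -*- coding: utf-8 -*-
+# 示意代码:x86 Python接口预测(库路径与部分接口名为假设,仅供参考)
+import sys
+sys.path.append('build.lite.x86/inference_lite_lib/python/lib')  # 假设的库路径
+from lite_core import *
+
+config = CxxConfig()
+config.set_model_file('./mobilenet_v1/model')   # combined模型的模型文件
+config.set_param_file('./mobilenet_v1/params')  # combined模型的参数文件
+config.set_valid_places([Place(TargetType.X86, PrecisionType.FP32)])
+
+predictor = create_paddle_predictor(config)
+
+input_tensor = predictor.get_input(0)                 # 假设的接口名
+input_tensor.resize([1, 3, 224, 224])                 # 假设的接口名
+input_tensor.set_float_data([1.0] * (3 * 224 * 224))  # 假设的接口名
+
+predictor.run()
+
+output_tensor = predictor.get_output(0)               # 假设的接口名
+print('Output shape:', output_tensor.shape())
+```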
diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..38385a4267d5727d9c5c7d985d3457dd011e203c
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc.md
@@ -0,0 +1,874 @@
+
+# C++ API文档
+
+## CreatePaddlePredictor
+
+```c++
+template <typename ConfigT>
+std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
+```
+
+`CreatePaddlePredictor`用来根据`MobileConfig`或`CxxConfig`构建预测器。
+
+示例:
+
+```c++
+// 设置MobileConfig
+MobileConfig config;
+config.set_model_dir(FLAGS_model_dir);
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+```
+
+参数:
+
+- `config(MobileConfig)` - 用于构建Predictor的配置信息。
+
+返回:`PaddlePredictor`指针
+
+返回类型:`std::shared_ptr<PaddlePredictor>`
+
+## CxxConfig
+
+```c++
+class CxxConfig;
+```
+
+`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。
+
+示例:
+
+```c++
+CxxConfig config;
+// 设置模型目录,加载非combined模型时使用
+config.set_model_dir(FLAGS_model_dir);
+// 设置工作线程数
+config.set_threads(4);
+// 设置能耗模式
+config.set_power_mode(LITE_POWER_NO_BIND);
+// 设置valid places
+std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+config.set_valid_places(valid_places);
+
+// 根据CxxConfig创建CxxPredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
+```
+
+### `set_model_dir(model_dir)`
+
+设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。
+
+参数:
+
+- `model_dir(std::string)` - 模型文件夹路径
+
+返回:`None`
+
+返回类型:`void`
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数:
+
+- `None`
+
+返回:模型文件夹路径
+
+返回类型:`std::string`
+
+### `set_model_file(model_file)`
+
+设置模型文件路径,加载combined形式模型时使用。
+
+参数:
+
+- `model_file(std::string)` - 模型文件路径
+
+返回类型:`void`
+
+### `model_file()`
+
+获取设置的模型文件路径,加载combined形式模型时使用。
+
+参数:
+
+- `None`
+
+返回:模型文件路径
+
+返回类型:`std::string`
+
+### `set_param_file(param_file)`
+
+设置模型参数文件路径,加载combined形式模型时使用。
+
+参数:
+
+- `param_file(std::string)` - 模型参数文件路径
+
+返回类型:`void`
+
+### `param_file()`
+
+获取设置的模型参数文件路径,加载combined形式模型时使用。
+
+参数:
+
+- `None`
+
+返回:模型参数文件路径
+
+返回类型:`std::string`
+
+### `set_valid_places(valid_places)`
+
+设置可用的places列表。
+
+参数:
+
+- `valid_places(std::vector<Place>)` - 可用place列表。
+
+返回类型:`void`
+
+示例:
+
+```c++
+CxxConfig config;
+// 设置模型目录,加载非combined模型时使用
+config.set_model_dir(FLAGS_model_dir);
+// 设置valid places
+// 注意,valid_places列表中Place的排序表明了用户对Place的偏好程度,如用户想优先使用ARM上Int8精度的
+// kernel,则应把Place{TARGET(kARM), PRECISION(kInt8)}置于valid_places列表的首位。
+std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kInt8)},
+                                Place{TARGET(kARM), PRECISION(kFloat)}};
+config.set_valid_places(valid_places);
+
+// 根据CxxConfig创建CxxPredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
+```
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。
+
+*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数:
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回:`None`
+
+返回类型:`void`
+
+### `power_mode()`
+
+获取设置的CPU能耗模式。
+
+*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数:
+
+- `None`
+
+返回:设置的CPU能耗模式
+
+返回类型:`PowerMode`
+
+### `set_threads(threads)`
+
+设置工作线程数。若不设置,则默认使用单线程。
+
+*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数:
+
+- `threads(int)` - 工作线程数
+
+返回:`None`
+
+返回类型:`void`
+
+### `threads()`
+
+获取设置的工作线程数。
+
+*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。*
+
+参数:
+
+- `None`
+
+返回:工作线程数
+
+返回类型:`int`
+
+### `set_x86_math_library_num_threads(threads)`
+
+设置CPU Math库线程数,CPU核心数支持情况下可加速预测。默认为1,并且仅在x86下有效。
+
+参数:
+
+- `threads(int)` - CPU Math库线程数。
+
+返回:`None`
+
+返回类型:`void`
+
+### `x86_math_library_num_threads()`
+
+返回CPU Math库线程数,CPU核心数支持情况下可加速预测。仅在x86下有效。
+
+参数:
+
+- `None`
+
+返回:CPU Math库线程数。
+
+返回类型:`int`
+
+## MobileConfig
+
+```c++
+class MobileConfig;
+```
+
+`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。
+
+*注意:输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。*
+
+示例:
+
+```c++
+MobileConfig config;
+// 设置NaiveBuffer格式模型目录,从文件加载模型时使用
+config.set_model_dir(FLAGS_model_dir);
+// 设置工作线程数
+config.set_threads(4);
+// 设置能耗模式
+config.set_power_mode(LITE_POWER_HIGH);
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+```
+
+### `set_model_from_file(model_dir)`
+
+设置模型文件,当需要从磁盘加载模型时使用。
+
+参数:
+
+- `model_dir(std::string)` - 模型文件路径
+
+返回:`None`
+
+返回类型:`void`
+
+### `set_model_dir(model_dir)`
+
+**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`set_model_from_file`接口。
+
+设置模型文件夹路径,当需要从磁盘加载模型时使用。
+
+参数:
+
+- `model_dir(std::string)` - 模型文件夹路径
+
+返回:`None`
+
+返回类型:`void`
+
+### `model_dir()`
+
+返回设置的模型文件夹路径。
+
+参数:
+
+- `None`
+
+返回:模型文件夹路径
+
+返回类型:`std::string`
+
+### `set_model_from_buffer(model_buffer)`
+
+设置模型的内存数据,当需要从内存加载模型时使用。
+
+参数:
+
+- `model_buffer(std::string)` - 内存中的模型数据
+
+返回:`None`
+
+返回类型:`void`
+
+### `set_model_buffer(model_buffer, model_buffer_size, param_buffer, param_buffer_size)`
+
+**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`set_model_from_buffer`接口。
+
+设置模型、参数的内存地址,当需要从内存加载模型时使用。
+
+示例:
+
+```c++
+// 读取模型文件到内存
+std::string model_buffer = ReadFile(FLAGS_model_path);
+std::string params_buffer = lite::ReadFile(FLAGS_params_path);
+
+// 设置MobileConfig
+lite_api::MobileConfig config;
+config.set_model_buffer(model_buffer.c_str(), model_buffer.size(),
+                        params_buffer.c_str(), params_buffer.size());
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+```
+
+参数:
+
+- `model_buffer(const char*)` - 内存中模型结构数据。
+- `model_buffer_size(size_t)` - 内存中模型结构数据的大小。
+- `param_buffer(const char*)` - 内存中模型参数数据。
+- `param_buffer_size(size_t)` - 内存中模型参数数据的大小。
+
+返回:`None`
+
+返回类型:`void`
+
+### `model_from_memory()`
+
+是否从内存中加载模型,当使用`set_model_buffer`接口时返回`true`。
+
+参数:
+
+- `None`
+
+返回:是否从内存加载模型
+
+返回类型:`bool`
+
+### `model_buffer()`
+
+获取内存中模型结构数据。
+
+参数:
+
+- `None`
+
+返回:内存中模型结构数据
+
+返回类型:`const std::string&`
+
+### `param_buffer()`
+
+获取内存中模型参数数据。
+
+参数:
+
+- `None`
+
+返回:内存中模型参数数据
+
+返回类型:`const std::string&`
+
+### `set_power_mode(mode)`
+
+设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。
+
+*注意:只在开启`OpenMP`时生效,否则系统自动调度。*
+
+参数:
+
+- `mode(PowerMode)` - CPU能耗模式
+
+返回:`None`
+
+返回类型:`void`
+
+### `power_mode()`
+
+获取设置的CPU能耗模式。
+
+参数:
+
+- `None`
+
+返回:设置的CPU能耗模式
+
+返回类型:`PowerMode`
+
+### `set_threads(threads)`
+
+设置工作线程数。若不设置,则默认使用单线程。
+
+*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。*
+
+参数:
+
+- `threads(int)` - 工作线程数
+
+返回:`None`
+
+返回类型:`void`
+
+### `threads()`
+
+获取设置的工作线程数。
+
+参数:
+
+- `None`
+
+返回:工作线程数
+
+返回类型:`int`
+
+## PaddlePredictor
+
+```c++
+class PaddlePredictor;
+```
+
+`PaddlePredictor`是Paddle-Lite的预测器,由`CreatePaddlePredictor`根据`MobileConfig`进行创建。用户可以根据PaddlePredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。
+
+示例:
+
+```c++
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+// 设置MobileConfig
+MobileConfig config;
+config.set_model_dir(FLAGS_model_dir);
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+// 获得模型的输入和输出名称
+std::vector<std::string> input_names = predictor->GetInputNames();
+for (int i = 0; i < input_names.size(); i++) {
+  printf("Input name[%d]: %s\n", i, input_names[i].c_str());
+}
+std::vector<std::string> output_names = predictor->GetOutputNames();
+for (int i = 0; i < output_names.size(); i++) {
+  printf("Output name[%d]: %s\n", i, output_names[i].c_str());
+}
+
+// 准备输入数据
+// (1)根据index获取输入Tensor
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+// (2)根据名称获取输入Tensor
+// std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInputByName(input_names[0])));
+input_tensor->Resize({1, 3, 224, 224});
+auto* data = input_tensor->mutable_data<float>();
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+
+// 执行预测
+predictor->Run();
+
+// 获取输出
+// (1)根据index获取输出Tensor
+std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
+// (2)根据名称获取输出Tensor
+// std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetTensor(output_names[0])));
+printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
+for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+  printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+}
+```
+
+### `GetInput(index)`
+
+获取输入Tensor指针,用来设置模型的输入数据。
+
+参数:
+
+- `index(int)` - 输入Tensor的索引
+
+返回:第`index`个输入`Tensor`的指针
+
+返回类型:`std::unique_ptr<Tensor>`
+
+### `GetOutput(index)`
+
+获取输出Tensor的指针,用来获取模型的输出结果。
+
+参数:
+
+- `index(int)` - 输出Tensor的索引
+
+返回:第`index`个输出`Tensor`的指针
+
+返回类型:`std::unique_ptr<const Tensor>`
+
+### `GetInputNames()`
+
+获取所有输入Tensor的名称。
+
+参数:
+
+- `None`
+
+返回:所有输入Tensor的名称
+
+返回类型:`std::vector<std::string>`
+
+### `GetOutputNames()`
+
+获取所有输出Tensor的名称。
+
+参数:
+
+- `None`
+
+返回:所有输出Tensor的名称
+
+返回类型:`std::vector<std::string>`
+
+### `GetInputByName(name)`
+
+根据名称获取输入Tensor的指针,用来设置模型的输入数据。
+
+参数:
+
+- `name(const std::string)` - 输入Tensor的名称
+
+返回:输入`Tensor`的指针
+
+返回类型:`std::unique_ptr<Tensor>`
+
+### `GetTensor(name)`
+
+根据名称获取输出Tensor的指针。
+
+**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。
+
+参数:
+
+- `name(const std::string)` - Tensor的名称
+
+返回:指向`const Tensor`的指针
+
+返回类型:`std::unique_ptr<const Tensor>`
+
+### `Run()`
+
+执行模型预测,需要在***设置输入数据后***调用。
+
+参数:
+
+- `None`
+
+返回:`None`
+
+返回类型:`void`
+
+### `GetVersion()`
+
+用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。
+
+参数:
+
+- `None`
+
+返回:当前lib使用的代码版本信息
+
+返回类型:`std::string`
+
+## TargetType
+
+```c++
+enum class TargetType;
+```
+`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。
+
+枚举型变量`TargetType`的所有可能取值包括:
+
+`{X86, CUDA, ARM, OpenCL, FPGA, NPU}`
+
+## PrecisionType
+
+```c++
+enum class PrecisionType;
+```
+`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。
+
+枚举型变量`PrecisionType`的所有可能取值包括:
+
+`{FP32, INT8, INT32, INT64}`
+
+## DataLayoutType
+
+```c++
+enum class DataLayoutType;
+```
+`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, width)。
+
+枚举型变量`DataLayoutType`的所有可能取值包括:
+
+`{NCHW, NHWC}`
+
+## Place
+
+```c++
+class Place {
+  TargetType target;
+  PrecisionType precision{FP32};
+  DataLayoutType layout{NCHW};
+};
+```
+`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。
+
+示例:
+```c++
+Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)}
+```
+
+## PowerMode
+
+```c++
+enum PowerMode;
+```
+
+`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。
+
+示例:
+
+```c++
+MobileConfig config;
+// 设置NaiveBuffer格式模型目录
+config.set_model_dir(FLAGS_model_dir);
+// 设置能耗模式
+config.set_power_mode(LITE_POWER_HIGH);
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+```
+
+PowerMode详细说明如下:
+
+| 选项 | 说明 |
+| :------------------: | ------------------------------------------------------------ |
+| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 |
+| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 |
+| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 |
+| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 |
+| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 |
+| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 |
+
+## Tensor
+
+```c++
+class Tensor;
+```
+
+Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。
+
+*注意:用户应使用`PaddlePredictor`的`GetInput`和`GetOutput`接口获取输入/输出的`Tensor`。*
+
+示例:
+
+```c++
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+// 设置MobileConfig
+MobileConfig config;
+config.set_model_dir(FLAGS_model_dir);
+
+// 根据MobileConfig创建PaddlePredictor
+std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+// 准备输入数据,获取输入Tensor
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+// 设置输入Tensor维度信息
+input_tensor->Resize({1, 3, 224, 224});
+// 设置输入数据
+auto* data = input_tensor->mutable_data<float>();
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+
+// 执行预测
+predictor->Run();
+
+// 获取输出Tensor
+std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
+// 获取输出Tensor维度
+printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
+// 获取输出Tensor数据
+for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+  printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+}
+```
+
+### `Resize(shape)`
+
+设置Tensor的维度信息。
+
+参数:
+
+- `shape(std::vector<int64_t>)` - 维度信息
+
+返回:`None`
+
+返回类型:`void`
+
+### `shape()`
+
+获取Tensor的维度信息。
+
+参数:
+
+- `None`
+
+返回:Tensor的维度信息
+
+返回类型:`std::vector<int64_t>`
+
+### `data()`
+
+```c++
+template <typename T>
+const T* data() const;
+```
+
+获取Tensor的底层数据的常量指针,根据传入的模板参数类型获取相应类型的数据,用于读取Tensor数据。
+
+示例:
+
+```c++
+std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
+// 如果模型中输出为float类型
+output_tensor->data<float>()
+```
+
+参数:
+
+- `None`
+
+返回:`Tensor`底层数据常量指针
+
+返回类型:`const T*`
+
+### `mutable_data()`
+
+```c++
+template <typename T>
+T* mutable_data() const;
+```
+
+获取Tensor的底层数据的指针,根据传入的模板参数类型获取相应类型的数据,用于设置Tensor数据。
+
+示例:
+
+```c++
+std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+// 如果模型中输入为float类型
+auto* data = input_tensor->mutable_data<float>();
+// 设置Tensor数据
+for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+  data[i] = 1;
+}
+```
+
+参数:
+
+- `None`
+
+返回:`Tensor`底层数据指针
+
+返回类型:`T*`
+
+### `SetLoD(lod)`
+
+设置Tensor的LoD信息。
+
+参数:
+
+- `lod(std::vector<std::vector<uint64_t>>)` - Tensor的LoD信息
+
+返回:`None`
+
+返回类型:`void`
+
+### `lod()`
+
+获取Tensor的LoD信息。
+
+参数:
+
+- `None`
+
+返回:`Tensor`的LoD信息
+
+返回类型:`std::vector<std::vector<uint64_t>>`
diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst
new file mode 100644
index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/benchmark/benchmark.md b/docs/benchmark/benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..efb0805fddc0bd62a2b21a130018edaa9213e0cf --- /dev/null +++ b/docs/benchmark/benchmark.md @@ -0,0 +1,150 @@ +# Benchmark 数据 + +可以参考[benchmark_tools](benchmark_tools),推荐**一键benchmark**。 + +## 测试环境 + +* 测试模型 + * fp32模型 + * mobilenet_v1 + * mobilenet_v2 + * squeezenet_v1.1 + * mnasnet + * shufflenet_v2 + + * int8模型 + * mobilenet_v1 + * mobilenet_v2 + * resnet50 + +* 测试机器(android ndk ndk-r17c) + * 骁龙855 + * xiaomi mi9, snapdragon 855 + * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz + + + * 骁龙845 + * xiaomi mi8, 845 + * 2.8GHz(大四核),1.7GHz(小四核) + + * 骁龙835 + * xiaomi mix2, snapdragon 835 + * 2.45GHz(大四核),1.9GHz(小四核) + + * 骁龙625 + * oppo R9s, snapdragon625 + * A53 x 8, big core@2.0GHz + + * 骁龙653 + * 360 N5, snapdragon 653 + * 4 x A73@2.0GHz + 4 x A53@1.4GHz + + * 麒麟970 + * HUAWEI Mate10 + +* 测试说明 + * branch: release/2.0.0 + * warmup=10, repeats=30,统计平均时间,单位是ms + * 当线程数为1时,```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH,否者设置LITE_POWER_NO_BIND + * 模型的输入图像的维度是{1, 3, 224, 224},输入图像的每一位数值是1 + +## 测试数据 + + +### fp32模型测试数据 + +#### paddlepaddle model + + +骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 +mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15 +mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84 +shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66 +squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71 +mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55 + + + +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 +mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64 +mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63 +shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97 +squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06 +mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09 + + +麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 +mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41 +mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38 +shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82 +squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66 +mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68 + +麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 +mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43 +mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53 +shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91 +squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62 +mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56 + +#### caffe model + +骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 | +mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 | +shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 | + + +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 | +mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 | +shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 | + + +麒麟980|armv7 | armv7 | armv7 |armv8 | 
armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 | +mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 | +shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 | + + +麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 | +mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 | +shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 | + +#### int8量化模型测试数据 + +骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 | +mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 | + +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |60.76 |32.25 |16.66 |56.57 |29.84 |15.24 | +mobilenet_v2 |49.38 |31.10 |22.07 |47.52 |28.18 |19.24 | + + +麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +----| ---- | ---- | ---- | ---- |---- |---- +threads num|1 |2 |4 |1 |2 |4 | +mobilenet_v1 |65.95 |34.39 |18.68 |60.86 |30.98 |16.31 | +mobilenet_v2 |68.87 |39.39 |24.43 |65.57 |37.31 |20.87 | diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..60341762b70772bc46196b836050714b9d43228b --- /dev/null +++ b/docs/benchmark/benchmark_tools.md @@ -0,0 +1,187 @@ +# Benchmark 测试方法 + +本文将会介绍,在**Ubuntu:16.04交叉编译环境**下,用安卓手机在终端测试Paddle-Lite的性能,并介绍两种Benchmark方法: + +1. **一键Benchmark**:适用于想快速获得常见模型性能的用户,下载预编译好的benchmark可执行文件; +2. **逐步Benchmark**:将**一键Benchmark**流程拆解讲解。 + +## 环境准备 + +1. 准备[adb](https://developer.android.com/studio/command-line/adb)等必备软件: +```shell +sudo apt update +sudo apt install -y wget adb +``` +2. 检查手机与电脑连接。安卓手机USB连上电脑,打开设置 -> 开启开发者模式 -> 开启USB调试 -> 允许(授权)当前电脑调试手机; +3. 在电脑终端输入`adb devices`命令,查看当前连接到的设备: +```shell +adb devices +``` +命令成功执行,显示结果类似下面(序列码略有不同): +```shell +List of devices attached +712QSDSEMMS7C device +``` + +## 一. 一键Benchmark + +执行以下命令,完成Benchmark: + +```shell +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh +sh run_benchmark.sh +``` + +该`run_benchmark.sh`脚本会: + +1. 下载模型,并上传手机:包含mobilenetv1/v2、shufflenetv2、squeezenetv1.1、mnasnet; +2. 下载pre-built android-armv7和android-armv8的可执行文件,并上传手机:`benchmark_bin_v7`和`benchmark_bin_v8`; +3. 自动执行另一个脚本`benchmark.sh`(多台手机连接USB,请在`benchmark.sh`脚本中对`adb`命令后加上测试手机的`serial number`); +4. 从手机下载benchmark结果`result_armv7.txt`和`result_armv8.txt`,到当前目录,并显示Benchmark结果。 + +## 二. 逐步Benchmark + +### 1. 
获取benchmark可执行文件 + +benchmark_bin文件可以测试PaddleLite的性能,有下面两种方式获得。 + +#### 方式一:下载benchmark_bin可执行文件 + +```shell +# Download benchmark_bin for android-armv7 +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7 + +# Download benchmark_bin for android-armv8 +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8 +``` + +#### 方式二:由源码编译benchmark_bin文件 + +根据[源码编译](../source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: + +```shell +########################################### +# Build benchmark_bin for android-armv7 # +########################################### +./lite/tools/ci_build.sh \ + --arm_os="android" \ + --arm_abi="armv7" \ + --arm_lang="gcc " \ + build_arm + +# `benchmark_bin` 在: /build.lite.android.armv7.gcc/lite/api/benchmark_bin + +########################################### +# Build benchmark_bin for android-armv8 # +########################################### +./lite/tools/ci_build.sh \ + --arm_os="android" \ + --arm_abi="armv8" \ + --arm_lang="gcc " \ + build_arm + +# `benchmark_bin` 在: /build.lite.android.armv8.gcc/lite/api/benchmark_bin +``` + +> **注意**:为了避免在docker内部访问不到手机的问题,建议编译得到benchmark_bin后退出到docker外面,并且将benchmark_bin文件拷贝到一个临时目录。然后在该临时目录下,按照下面步骤下载模型、拷贝脚本、测试。 + +### 2. 准备模型 + +PaddleLite为Benchmark准备好了[常见Benchmark模型](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz)。 + +执行以下命令,下载常见Benchmark模型并解压: + +```shell +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz +tar zxvf benchmark_models.tgz +``` + +如果测试其他模型,请将模型文件放到 `benchmark_models` 文件夹中。 + +### 3. benchmark.sh脚本 + +benchmark测试的执行脚本`benchmark.sh` 位于源码中的`/PaddleLite/lite/tools/benchmark.sh`位置,测试时需要将`benchmark.sh`、 `benchmark_bin` 、 `benchmark_models` 文件复制到同一目录下。 + +### 4. 
测试 + +从终端进入benchmark.sh、可执行文件(benchmark_bin_v7、benchmark_bin_v8)和模型文件(benchmark_models)所在文件夹。 + +如果 `benchmark_models` 中所有模型文件都已经使用 `model_optimize_tool` 进行转换,则使用 benchmark.sh 脚本执行如下命令进行测试: + +```shell +# Benchmark for android-armv7 +sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt + +# Benchmark for android-armv8 +sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt +``` + +如果 `benchmark_models` 中所有模型文件都没有使用 `model_optimize_tool` 进行转换,则执行下面的命令。`benchmark_bin` 会首先转换模型,然后加载模型进行测试。 + +```shell +# Benchmark for android-armv7 +sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt true + +# Benchmark for android-armv8 +sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true +``` + +测试结束后,armv7和armv8的结果,分别保存在当前目录下的`result_armv7.txt`和`result_armv8.txt`文件中。 + +**查看测试结果** + +在当前目录的`result_armv7.txt`和`result_armv8.txt`文件,查看测试结果。 + +> 不同手机,不同版本,测试模型的性能数据不同。 + +```shell +run benchmark armv7 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 159.8427 ms +-- mobilenet_v1 avg = 235.0072 ms +-- mobilenet_v2 avg = 173.0387 ms +-- shufflenet_v2 avg = 76.0040 ms +-- squeezenet_v11 avg = 164.2957 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 83.1287 ms +-- mobilenet_v1 avg = 121.6029 ms +-- mobilenet_v2 avg = 86.6175 ms +-- shufflenet_v2 avg = 41.5761 ms +-- squeezenet_v11 avg = 87.8678 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 73.3880 ms +-- mobilenet_v1 avg = 119.0739 ms +-- mobilenet_v2 avg = 85.3050 ms +-- shufflenet_v2 avg = 38.0762 ms +-- squeezenet_v11 avg = 64.2201 ms +-------------------------------------- + +run benchmark armv8 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 165.3073 ms +-- mobilenet_v1 avg = 306.0188 ms +-- mobilenet_v2 avg = 195.1884 ms +-- shufflenet_v2 avg = 99.3692 ms +-- squeezenet_v11 avg = 156.6971 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 90.2290 ms +-- mobilenet_v1 avg = 157.0007 ms +-- mobilenet_v2 avg = 118.1607 ms +-- shufflenet_v2 avg = 68.6804 ms +-- squeezenet_v11 avg = 91.3090 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 179.9730 ms +-- mobilenet_v1 avg = 204.0684 ms +-- mobilenet_v2 avg = 181.6486 ms +-- shufflenet_v2 avg = 123.2728 ms +-- squeezenet_v11 avg = 412.9046 ms +-------------------------------------- +``` diff --git a/docs/benchmark/index.rst b/docs/benchmark/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8548e32056a8a824c11f6a622e91c4a6c7da2c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +#sys.path.insert(0, os.path.abspath('.')) + +import sphinx_rtd_theme +from recommonmark.parser import CommonMarkParser +from recommonmark.transform import AutoStructify + +# -- Project information ----------------------------------------------------- + +project = u'Paddle-Lite' +copyright = u'2020, Paddle-Lite Developer' +author = u'Paddle-Lite Developer' + +# The short X.Y version +version = u'latest' +# The full version, including alpha/beta/rc tags +release = u'' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['recommonmark', 'sphinx_markdown_tables'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Paddle-Litedoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Paddle-Lite.tex', u'Paddle-Lite Documentation', + u'Paddle-Lite Developer', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'paddle-lite', u'Paddle-Lite Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Paddle-Lite', u'Paddle-Lite Documentation', + author, 'Paddle-Lite', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] diff --git a/docs/develop_guides/index.rst b/docs/develop_guides/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/images/architecture.png b/docs/images/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..35cb336a0640c868d6fc1df738f039a0e7b5884d Binary files /dev/null and b/docs/images/architecture.png differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7359f1d0508f8e85824f450ca07f095d047f90c --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,71 @@ +.. Paddle-Lite documentation master file, created by + sphinx-quickstart on Thu Feb 6 14:11:30 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Paddle-Lite's documentation! +======================================= + +.. toctree:: + :maxdepth: 1 + :caption: 简介 + :name: sec-introduction + + introduction/tech_highlights + introduction/architecture + +.. toctree:: + :maxdepth: 1 + :caption: Benchmark数据和方法 + :name: sec-benchmark + + benchmark/benchmark + benchmark/benchmark_tools + +.. toctree:: + :maxdepth: 1 + :caption: 安装 + :name: sec-install + + installation/source_compile + +.. toctree:: + :maxdepth: 1 + :caption: 使用指南 + :name: sec-user-guides + + user_guides/model_optimize_tool + user_guides/library_tailoring + user_guides/cuda + user_guides/opencl + +.. toctree:: + :maxdepth: 1 + :caption: 进阶使用指南 + + advanced_user_guides/support_operation_list + advanced_user_guides/add_operation + advanced_user_guides/add_layout + advanced_user_guides/model_quantization + advanced_user_guides/add_new_pass + advanced_user_guides/x86 + +.. toctree:: + :maxdepth: 1 + :caption: 开发者文档 + +.. toctree:: + :maxdepth: 1 + :caption: API文档 + + api_reference/cxx_api_doc + +.. toctree:: + :maxdepth: 1 + :caption: FAQ + +.. 
toctree:: + :maxdepth: 1 + :caption: paddle-mobile + + diff --git a/docs/installation/library.md b/docs/installation/library.md new file mode 100644 index 0000000000000000000000000000000000000000..ef2f8fdb18ade439d620b348738cbb752d5bd8b6 --- /dev/null +++ b/docs/installation/library.md @@ -0,0 +1,61 @@ + +# 预测库说明 + +Paddle-Lite的编译结果为预测库文件(包括静态库和动态库),具体编译过程参考[源码编译](./source_compile)。 + +Lite预测库分为**基础预测库**和**全量预测库**:基础预测库只打包了基础模型需要的基础算子,预测库体积较小;全量预测库打包了所有的Lite算子,可以支持更多的模型,但是预测库的体积也更大。 编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译基础预测库,`--build_extra=ON`时编译全量的预测库。 + +## 基础预测库 + +### 编译方法 +编译时设置`--build_extra=OFF` (默认值) 或不指定即可编译出基础预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish +``` + +### 基础预测库支持的功能 + +(1)支持基础CV模型 + +(2)支持基础的in8量化模型 + +(3)支持[benchmark测试](../benchmark/benchmark) + + +### 基础预测库支持的基础模型: + +1. fluid基础模型(paddle model 提供的基础模型9个) + +``` +mobileNetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 +mobileNetV2 resnet50 unet squeezenet_v11 +``` + +2. int8量化模型模型 + +``` +mobilenet_v1 mobilenet_v2 resnet50 +``` + +### 特点 + 轻量级预测库,体积更小,支持常用的基础模型。 + + + +## 全量预测库 + +### 编译方法 +编译时设置`--build_extra=ON` 即可编译出全量预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish +``` +### 全量预测库功能 + +(1) 基础预测库所有功能 + +(2)支持所有Paddle-Lite中注册的所有算子 + +### 特点 + 支持更多的硬件平台和算子,可以支持更多模型但体量更大。 diff --git a/docs/installation/source_compile.md b/docs/installation/source_compile.md new file mode 100644 index 0000000000000000000000000000000000000000..f2016b83188b755eca8daab8a4aa38b25e08c0f1 --- /dev/null +++ b/docs/installation/source_compile.md @@ -0,0 +1,415 @@ + +# 源码编译 + +Paddle-Lite 提供了移动端的一键源码编译脚本 `lite/tools/build.sh`,编译流程如下: + +1. 环境准备(选择其一):Docker交叉编译环境、Linux交叉编译环境 +2. 编译:调用`build.sh`脚本一键编译 + +## 一、环境准备 + +目前支持三种编译的环境: + +1. Docker 容器环境, +2. Linux(推荐 Ubuntu 16.04)环境, +3. Mac OS 环境。 + +### 1、 Docker开发环境 + +[Docker](https://www.docker.com/) 是一个开源的应用容器引擎, 使用沙箱机制创建独立容器,方便运行不同程序。Docker初学者可以参考[Docker使用方法](https://thenewstack.io/docker-station-part-one-essential-docker-concepts-tools-terminology/)正确安装Docker。 + +#### 准备Docker镜像 + +有两种方式准备Docker镜像,推荐从Dockerhub直接拉取Docker镜像 + +```shell +# 方式一:从Dockerhub直接拉取Docker镜像 +docker pull paddlepaddle/paddle-lite:2.0.0_beta + +# 方式二:本地源码编译Docker镜像 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite/lite/tools +mkdir mobile_image +cp Dockerfile.mobile mobile_image/Dockerfile +cd mobile_image +docker build -t paddlepaddle/paddle-lite . + +# 镜像编译成功后,可用`docker images`命令,看到`paddlepaddle/paddle-lite`镜像。 +``` + +#### 进入Docker容器 + +在拉取Paddle-Lite仓库代码的上层目录,执行如下代码,进入Docker容器: + +```shell +docker run -it \ + --name paddlelite_docker \ + -v $PWD/Paddle-Lite:/Paddle-Lite \ + --net=host \ + paddlepaddle/paddle-lite /bin/bash +``` + +该命令的含义:将容器命名为`paddlelite_docker`即``,将当前目录下的`Paddle-Lite`文件夹挂载到容器中的`/Paddle-Lite`这个根目录下,并进入容器中。至此,完成Docker环境的准备。 + +#### Docker常用命令 + +```shell +# 退出容器但不停止/关闭容器:键盘同时按住三个键:CTRL + q + p + +# 启动停止的容器 +docker start + +# 从shell进入已启动的容器 +docker attach + +# 停止正在运行的Docker容器 +docker stop + +# 重新启动正在运行的Docker容器 +docker restart + +# 删除Docker容器 +docker rm +``` + +### 2、Linux 开发环境 + +#### Android + +##### 交叉编译环境要求 + +- gcc、g++、git、make、wget、python、adb +- Java environment +- cmake(建议使用3.10或以上版本) +- Android NDK (建议ndk-r17c) + +##### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版类似。 + +```shell +# 1. 
Install basic software +apt update +apt-get install -y --no-install-recommends \ + gcc g++ git make wget python unzip adb curl + +# 2. Prepare Java env. +apt-get install -y default-jdk + +# 3. Install cmake 3.10 or above +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ + tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \ + mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \ + ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \ + ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake + +# 4. Download Android NDK for linux-x86_64 +# Note: Skip this step if NDK installed +# recommand android-ndk-r17c-darwin-x86_64 +# ref: https://developer.android.com/ndk/downloads +cd /tmp && curl -O https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip +cd /opt && unzip /tmp/android-ndk-r17c-linux-x86_64.zip + +# 5. Add environment ${NDK_ROOT} to `~/.bashrc` +echo "export NDK_ROOT=/opt/android-ndk-r17c" >> ~/.bashrc +source ~/.bashrc +``` + +#### ARM Linux + +适用于基于 ARMv8 和 ARMv7 架构 CPU 的各种开发板,例如 RK3399,树莓派等,目前支持交叉编译和本地编译两种方式,对于交叉编译方式,在完成目标程序编译后,可通过 scp 方式将程序拷贝到开发板运行。 + +##### 交叉编译 + +###### 编译环境要求 + +- gcc、g++、git、make、wget、python、scp +- cmake(建议使用3.10或以上版本) + +###### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版类似。 + +```shell +# 1. Install basic software +apt update +apt-get install -y --no-install-recommends \ + gcc g++ git make wget python unzip + +# 2. Install arm gcc toolchains +apt-get install -y --no-install-recommends \ + g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \ + g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \ + gcc-aarch64-linux-gnu g++-aarch64-linux-gnu + +# 3. Install cmake 3.10 or above +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ + tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \ + mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \ + ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \ + ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake +``` + +##### 本地编译(直接在RK3399或树莓派上编译) + +###### 编译环境要求 + +- gcc、g++、git、make、wget、python +- cmake(建议使用3.10或以上版本) + +###### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版本类似。 + +```shell +# 1. Install basic software +apt update +apt-get install -y --no-install-recomends \ + gcc g++ make wget python unzip + +# 2. install cmake 3.10 or above +wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz +tar -zxvf cmake-3.10.3.tar.gz +cd cmake-3.10.3 +./configure +make +sudo make install +``` + +之后可通过cmake --version查看cmake是否安装成功。 + +至此,完成 Linux 交叉编译环境的准备。 + +### 3、Mac OS 开发环境 + +#### 交叉编译环境要求 + +- gcc、git、make、curl、unzip、java +- cmake(Android编译请使用3.10版本,IOS编译请使用3.15版本) +- 编译Android: Android NDK (建议ndk-r17c) +- 编译IOS: XCode(Version 10.1) + +#### 具体步骤 + +```bash +# 1. Install basic software +brew install curl gcc git make unzip wget + +# 2. 
Install cmake: mac上实现IOS编译和Android编译要求的cmake版本不一致,可以根据需求选择安装。 +# (1)在mac环境编译 Paddle-Lite 的Android版本,需要安装cmake 3.10 +# mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/ +# wget https://cmake.org/files/v3.10/cmake-3.10.2-Darwin-x86_64.tar.gz +# tar zxf ./cmake-3.10.2-Darwin-x86_64.tar.gz +# mv cmake-3.10.2-Darwin-x86_64/CMake.app/Contents/ ./3.10.2 +# ln -s /usr/local/Cellar/cmake/3.10.2/bin/cmake /usr/local/bin/cmake +# (2)在mac环境编译 Paddle-Lite 的IOS版本,需要安装cmake 3.15 +# mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/ +# cd /usr/local/Cellar/cmake/ +# wget https://cmake.org/files/v3.15/cmake-3.15.2-Darwin-x86_64.tar.gz +# tar zxf ./cmake-3.15.2-Darwin-x86_64.tar.gz +# mv cmake-3.15.2-Darwin-x86_64/CMake.app/Contents/ ./3.15.2 +# ln -s /usr/local/Cellar/cmake/3.15.2/bin/cmake /usr/local/bin/cmake + +# 3. Download Android NDK for Mac +# recommand android-ndk-r17c-darwin-x86_64 +# ref: https://developer.android.com/ndk/downloads +# Note: Skip this step if NDK installed +cd ~/Documents && curl -O https://dl.google.com/android/repository/android-ndk-r17c-darwin-x86_64.zip +cd ~/Library && unzip ~/Documents/android-ndk-r17c-darwin-x86_64.zip + +# 4. Add environment ${NDK_ROOT} to `~/.bash_profile` +echo "export NDK_ROOT=~/Library/android-ndk-r17c" >> ~/.bash_profile +source ~/.bash_profile + +# 5. Install Java Environment +brew cask install java + +# 6. 编译IOS需要安装XCode(Version 10.1),可以在App Store里安装。安装后需要启动一次并执行下面语句。 +# sudo xcode-select -s /Applications/Xcode.app/Contents/Developer +``` + +至此,完成 Mac 交叉编译环境的准备。 + +**注意**: Mac上编译Paddle-Lite的full_publish版本时,Paddle-Lite所在路径中不可以含有中文字符 + +## 二、编译PaddleLite + +### 下载代码 + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +git checkout +``` + +### 编译模式与参数 + +编译脚本`./lite/tools/build.sh`,支持三种编译模式: + +| 编译模式 | 介绍 | 适用对象 | +|:-------:|-----|:-------:| +| tiny_publish | 编译移动端部署库,无第三方库依赖 | 用户 | +| full_publish | 编译移动端部署库,有第三方依赖如protobuf、glags等,含有可将模型转换为无需protobuf依赖的naive buffer格式的工具,供tiny_publish库使用 | 用户 | +| test | 编译指定`arm_os`、`arm_abi`下的移动端单元测试 | 框架开发者 | + +编译脚本`./lite/tools/build.sh`,追加参数说明: + +| 参数 | 介绍 | 值 | +|-----------|-------------|-------------| +| --arm_os |必选,选择安装平台 | `android`、`ios`、`ios64`、`armlinux` | +| --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | +| --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | +| --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| +| --build_java | 可选,是否编译java预测库(默认为OFF) | `ON`、`OFF` | +| --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | +| target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | + +### 编译代码 + +**注意**:非开发者建议在编译前使用[**“加速第三方依赖库的下载”**](#id22)的方法,加速工程中第三方依赖库的下载与编译。 + +#### 编译`tiny publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=OFF \ + tiny_publish +``` +##### IOS +```shell +./lite/tools/build.sh \ + --arm_os=ios64 \ + --arm_abi=armv8 \ + --build_extra=OFF \ + ios +``` +**注意:mac环境编译IOS 时,cmake版本需要高于cmake 3.15;mac环境上编译Android时,cmake版本需要设置为cmake 3.10。** + +ios tiny publish支持的编译选项: + +* `--arm_os`: 可选ios或者ios64 +* `--arm_abi`: 可选armv7和armv8(**注意**:当`arm_os=ios`时只能选择`arm_abi=armv7`,当`arm_os=ios64`时只能选择`arm_abi=armv8`) +* 如果mac编译过程中报错:"Invalid CMAKE_DEVELOPER_ROOT: does not exist", 运行: 
+```shell +sudo xcode-select -s /Applications/Xcode.app/Contents/Developer +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --build_extra=OFF \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + --build_extra=OFF \ + tiny_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +#### 编译`full publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=OFF \ + full_publish +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + --build_extra=OFF \ + full_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +### 编译结果说明 + +**编译最终产物位置**在 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx` ,如 Android 下 ARMv8 的产物位于`inference_lite_lib.android.armv8`: + +![](https://user-images.githubusercontent.com/45189361/65375706-204e8780-dccb-11e9-9816-ab4563ce0963.png) + +**目录内容**(可能)如下: + +**Full_publish编译结果:** + +![](https://user-images.githubusercontent.com/45189361/65375704-19c01000-dccb-11e9-9650-6856c7a5bf82.png) + +**Tiny_publish结果:** + +![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + +**IOS编译结果:** + +![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + + + +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `paddle_code_generator`、`test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 + +3、 `demo`文件夹:示例 demo ,包含 C++ demo 和 Java demo。 + +- `cxx` : C++示例 demo + - `mobile_full` : full_api 的使用示例 + - `mobile_light` : light_api的使用示例 +- `java` :Java 示例 demo + - `android` : Java的 Android 示例 + +4、 `java` 文件夹:包含 Jni 的动态库文件与相应的 Jar 包 + +- `jar` : `PaddlePredictor.jar` +- `so` : Jni动态链接库 `libpaddle_lite_jni.so` + +5、 `third_party` 文件夹:第三方库文件`gflags` + +**注意:** + +1、 只有当`--arm_os=android` 时才会编译出: + +- Java库文件与示例:`Java`和`demo/java` + +- 动态库文件:`libpaddle_full_api_shared.so`,`libpaddle_light_api_shared.so` + +2、 `tiny_publish`编译结果不包括 C++ demo和 C++ 静态库,但提供 C++ 的 light_api 动态库、 Jni 动态库和Java demo + +### 加速第三方依赖库的下载 + +移动端相关编译所需的第三方库均位于 `/third-party` 目录下,默认编译过程中,会利用`git submodule update --init --recursive`链上相关的第三方依赖的仓库。 + +为加速`full_publish`、`test`编译模式中对`protobuf`等第三方依赖的下载,`build.sh` 和 `ci_build.sh`支持了从国内 CDN 下载第三方依赖的压缩包。 + +使用方法:`git clone`完`Paddle-Lite`仓库代码后,手动删除本地仓库根目录下的`third-party`目录: + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +git checkout +cd Paddle-Lite +rm -rf third-party +``` + +之后再根据本文档,进行后续编译时,便会忽略第三方依赖对应的`submodule`,改为下载第三方压缩包。 diff --git a/docs/introduction/architecture.md b/docs/introduction/architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..1a94494af0b44a03988266d341be5788c46f96c2 --- /dev/null +++ b/docs/introduction/architecture.md @@ -0,0 +1,94 @@ +# 架构设计 + +Mobile 在这次升级为 Lite 架构, 侧重多硬件、高性能的支持,其主要设计思想如下 + +- 引入 Type system,强化多硬件、量化方法、data layout 的混合调度能力 +- 硬件细节隔离,通过不同编译开关,对支持的任何硬件可以自由插拔 +- 引入 MIR(Machine IR) 的概念,强化带执行环境下的优化支持 +- 优化期和执行期严格隔离,保证预测时轻量和高效率 + +架构图如下 + +![Paddle Inference Refactor1.0](https://user-images.githubusercontent.com/52520497/64949619-26e49580-d8ac-11e9-855a-514feb9b75af.png) + +## 
+
+- compile time 优化完毕可以将优化信息存储到模型中;execution time 载入并执行
+- 两套 API 及对应的预测lib,满足不同场景
+  - `CxxPredictor` 打包了 `Compile Time` 和 `Execution Time`,可以 runtime 在具体硬件上做分析和优化,得到最优效果
+  - `MobilePredictor` 只打包 `Execution Time`,保持部署和执行的轻量
+
+## `Execution Time` 轻量级设计和实现
+
+- 每个 batch 实际执行只包含两个步骤
+  - `Op.InferShape`
+  - `Kernel.Run`,Kernel 相关参数均使用指针提前确定,后续无查找或传参消耗
+  - 设计目标:执行时只有 kernel 计算本身的消耗
+- 轻量级 `Op` 及 `Kernel` 设计,避免框架额外消耗
+  - `Op` 只有 `CreateKernels` 和 `InferShape` 两个重要职能
+  - `Kernel` 只有 `Run` 职能
+
+## 多硬件后端支持
+
+- 硬件通用行为,使用 `TargetWrapper` 模块做适配器适配,对上层框架提供一致界面
+- 框架上层策略保持硬件无关,如存储优化 (Memory optimize),计算剪枝 (Computation prune) 等,任何硬件接入均可直接复用
+- 框架支持了硬件通用行为,特定硬件细节不做过多约束,各硬件可以自行实现并接入框架
+- 计算模式上目前支持两种主流模式,一种是类似 X86, ARM CPU 等非异构设备;一种是 GPU,或 FPGA 等异构设备(支持 stream, event异步执行模式以及跨设备拷贝)
+
+---
+## 多硬件及算法混合调度支持
+
+`TensorTy` 用来表示 Tensor 类型:
+
+```c++
+struct TensorTy {
+  TargetType target;
+  PrecisionType precision;
+  DataLayout layout;
+  int deviceid;
+};
+```
+
+```c++
+enum class TargetType { kARM, kX86, kCUDA, kOpenCL };
+enum class PrecisionType { kFP32, kFP16, kInt8, kInt16 };
+enum class DataLayout { kNCHW, kNHWC };
+```
+---
+
+注册 Kernel,确定特定 Kernel 的输入输出特征:
+
+```c++
+REGISTER_LITE_KERNEL(
+    mul, kARM, kFloat, kNCHW, arm::MulCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
+    .BindInput("Y", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
+    .BindOutput("Out", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
+    .Finalize();
+```
+
+---
+
+同一个 Op 的不同 Kernel 类似函数重载。
+
+用于支持任意的混合调度:
+
+1. 标记模型中所有 tensor 的 Type
+2. 标记 Kernel 的硬件、执行精度、data layout 等信息
+
+全局做类型推断,当发现 tensor 传递中有类型冲突时,采用 type cast 操作,通过插入特定功能的 Op 来实现正确的传导。
+
+![lite-7](https://user-images.githubusercontent.com/52520497/64949642-395ecf00-d8ac-11e9-8b69-ced1996abc3b.png)
+
+
+
+---
+
+## MIR 用于图分析优化
+
+基于 Type System 的 SSA,通过 IR Pass 对计算图进行分析和优化:
+
+- 支持对整个 graph 进行类型推断,发现类型冲突并加入 type cast op,来支持通用混合调度
+- 计算剪枝 (Compute prune),比如去掉 scale(1), assign op 等
+- 存储优化 (Memory optimize)
+- 操作熔合 (Operator fuse)(已经支持 fc, conv_bn, ele_add+act 等6种 fuse 策略)
+- 支持量化处理(已支持 Int8 预测)
diff --git a/docs/introduction/tech_highlights.md b/docs/introduction/tech_highlights.md
new file mode 100644
index 0000000000000000000000000000000000000000..83618aaa4bcbd9b7383782d193580e1d3dec7143
--- /dev/null
+++ b/docs/introduction/tech_highlights.md
@@ -0,0 +1,44 @@
+# 技术特点
+
+不同于普通移动端预测引擎所采用的类 Caffe 架构,Lite 架构最早的设计目标来源于 Paddle Server 和 Mobile 两种场景的要求:Server 端需要有完善的图分析和优化能力,Mobile 端要求有轻量级部署的能力;两种场景共同的要求是高性能、多硬件支持等。
+
+基于上述要求,Lite 架构完整实现了相应的能力,重点描述如下。
+
+## 多硬件支持
+
+Lite 架构已经验证和完整支持从 Mobile 到 Server 多种硬件的支持需求,包括 ARM CPU, ARM GPU, Huawei NPU, Intel X86 CPU, NV GPU 等。得益于对不同硬件适度的抽象,Lite 在框架保持清晰的同时支持不同硬件的特殊调度需求,在框架的清晰程度和硬件的特定调度优化上达到很好的平衡,比如 Nvidia GPU 上复杂的 stream, event 分配,在 Lite 中可以清晰表示。
+
+多种硬件的 Kernel 在代码层和执行层均互不干扰,用户可以自由插拔任何硬件的支持。
+
+## 高性能
+
+高性能来源于两方面:一是 Kernel 优化,二是框架执行。
+
+Kernel 方面,我们对相应硬件上的 Kernel 通过指令集、操作熔合、算法改写等方式进行了深入优化。
+
+框架执行方面,通过简化 Op 和 Kernel 的功能,使得执行期的框架开销极低;此外,框架极大的灵活性可以支持各种硬件的特定调度优化,以提升整体效率。
+
+## 量化支持
+
+Lite 支持 PaddleSlim 量化训练完毕的模型,因此完整保留了量化计算的高性能以及量化训练的高精度。
+
+## 强大的图分析和优化能力
+
+在图分析优化上,不同于常规的移动端预测引擎基于 Python 脚本工具转化模型,Lite 架构上有完整的基于 C++ 开发的 IR 及相应 Pass 集合,以支持操作熔合 (Operator fusion)、计算剪枝 (Computation pruning)、存储优化 (Memory optimization)、量化计算 (Quantitative computation) 等多类计算图优化。
+
+更多的优化策略可以简单通过添加 Pass 的方式模块化支持。
+
+## 轻量级部署
+
+尽管图优化上有复杂的策略,但并不影响移动端的轻量级部署,图分析模块和最终的执行引擎可以拆开使用,最终部署只有一层薄薄的 Kernel。
+
+## 可支持任意硬件的混合调度
+
+Lite 支持系统内可见的任意硬件间的混合调度,目前已经支持 ARM CPU 和 ARM GPU 的 Kernel 自动混合调度,并验证了 X86 CPU 和 Nvidia GPU 间的混合调度。
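+
+下面给出一段极简的 C++ 配置示意(非完整实现;模型路径为假设值,`kOpenCL` 等 Place 的可用性取决于编译选项),用来说明混合调度在 API 层的形态:`valid_places` 中各 Place 的先后即 Kernel 选取的优先级,框架全局做类型推断,并在类型冲突处自动插入 type cast Op:
+
+```c++
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+int main() {
+  CxxConfig config;
+  config.set_model_dir("./mobilenet_v1");  // 假设的模型目录
+  // 优先选用 ARM GPU(OpenCL)的 Kernel,无对应实现的 Op 回退到 ARM CPU
+  config.set_valid_places({Place{TARGET(kOpenCL), PRECISION(kFloat)},
+                           Place{TARGET(kARM), PRECISION(kFloat)}});
+  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
+  // ... 设置输入后调用 predictor->Run(),混合调度对调用方透明 ...
+  return 0;
+}
+```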
+支持混合调度的考量有两点: + +1. 当系统内同时存在多种硬件可用时,混合调度可以充分利用各类硬件资源 +2. 随着支持模型的增多,各硬件对kernel的支持丰富度不一,难免需要混合调度才能跑通 + +Lite架构通过从底层支持 `Type system` 的方式通用建模各类混合执行的行为,从而能够相对完备地支持混调。 diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..7893348a1b7dbb588983a48e6991282eae7e1b55 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/paddle_mobile/index.rst b/docs/paddle_mobile/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f11fa32f6f465f7b002d7fd37cbd78203206d8d7 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx +recommonmark +sphinx_markdown_tables +sphinx_rtd_theme diff --git a/docs/user_guides/cuda.md b/docs/user_guides/cuda.md new file mode 100644 index 0000000000000000000000000000000000000000..45597057bb18c44b60234459f9a49a59b54135f6 --- /dev/null +++ b/docs/user_guides/cuda.md @@ -0,0 +1,110 @@ +# Lite基于CUDA的模型预测 + +Lite支持在x86_64,arm64架构上(如:TX2)进行CUDA的编译运行。 + +## 编译 + +**NOTE:** 如果是在TX2等NVIDIA嵌入式硬件上编译,请使用最新的[Jetpack](https://developer.nvidia.com/embedded/jetpack) 安装依赖库。 + + +一: 下载代码 + +``` +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +``` + +二:编译 + +``` +# 进入代码目录 +cd Paddle-Lite + +# 运行编译脚本 +# 编译结束会在本目录下生成 build_cuda 目录 +# 编译过程中如果提示找不到CUDA,CUDNN,请在环境变量设置CUDA_TOOLKIT_ROOT_DIR, CUDNN_ROOT +# CUDA_TOOLKIT_ROOT_DIR,CUDNN_ROOT分别表示CUDA,CUDNN的根目录 +./lite/tools/build.sh cuda +# 如果使用python接口,需要打开build_python选项 +./lite/tools/build.sh --build_python=ON cuda +``` + +编译结束会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 + +## 运行 + +以下以Yolov3模型为例,介绍如何在Nvidia GPU硬件上运行模型。 + +一: 下载darknet_yolov3模型,模型信息请参考[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3) + + +``` +# 下载模型 +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz +tar -zxf yolov3_infer.tar.gz +# 下载图片样例 +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg +``` + +二: 运行 + +**NOTE:**此处示例使用的是python接口,后续会开放C++接口以及示例。 + +``` python +#-*- coding: utf-8 -*- +from __future__ import print_function +import sys +import numpy as np +import cv2 +sys.path.append('build_cuda/inference_lite_lib/python/lib') +from lite_core import * + +def read_img(im_path, resize_h, resize_w): + im = cv2.imread(im_path).astype('float32') + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + h, w, _ = im.shape + im_scale_x = resize_h / float(w) + im_scale_y = resize_w / float(h) + out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC) + mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, -1)) + std = 
np.array([0.229, 0.224, 0.225]).reshape((1, 1, -1))
+    out_img = (out_img / 255.0 - mean) / std
+    out_img = out_img.transpose((2, 0, 1))
+    return out_img
+
+# 配置config
+a = CxxConfig()
+a.set_model_file('./yolov3_infer/__model__')  # 指定模型文件路径
+a.set_param_file('./yolov3_infer/__params__')  # 指定参数文件路径
+place_cuda = Place(TargetType.CUDA)
+a.set_valid_places([place_cuda])
+
+# 创建predictor
+predictor = create_paddle_predictor(a)
+
+# 设置输入
+input_tensor = predictor.get_input(0)
+height, width = 608, 608
+input_tensor.resize([1, 3, height, width])
+data = read_img('./kite.jpg', height, width).flatten()
+input_tensor.set_float_data(data, TargetType.CUDA)
+
+in2 = predictor.get_input(1)
+in2.resize([1, 2])
+in2.set_int32_data([height, width], TargetType.CUDA)
+
+# 运行
+predictor.run()
+
+# 获取输出
+output_tensor = predictor.get_output(0)
+
+print(output_tensor.shape())
+# [100L, 6L]
+print(output_tensor.target())
+# TargetType.Host
+print(output_tensor.float_data()[:6])
+# [0.0, 0.9862784743309021, 98.51927185058594, 471.2381286621094, 120.73092651367188, 578.33251953125]
+
+```
+
+**NOTE:** 对CUDA的支持还在持续开发中。
diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ba12cf819945ab2f182f672a2c96123bc12e070
--- /dev/null
+++ b/docs/user_guides/library_tailoring.md
@@ -0,0 +1,185 @@
+
+# 裁剪预测库方法
+
+Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编译会将所有已注册的operator打包到预测库中,造成库文件体积膨胀;**裁剪预测库**能针对具体的模型,只打包优化后该模型需要的operator,有效降低预测库文件大小。
+
+## 效果展示(Tiny_publish Android动态预测库体积)
+
+| 测试模型 | 裁剪开关 | **libpaddle_lite_jni.so** |转化后模型中的OP|
+| ------------------ | ---------------------------- | -------- |------------------|
+| mobilenetv1(armv8) | 裁剪前--build_tailor=OFF | 1.5M | feed,fetch,conv2d,depthwise_conv2d,fc,pool2d,softmax |
+| mobilenetv1(armv8) | 裁剪后--build_tailor=ON | 788K |feed,fetch,conv2d,depthwise_conv2d,fc,pool2d,softmax|
+| mobilenetv2(armv8) | 裁剪前--build_tailor=OFF | 1.5M | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
+| mobilenetv2(armv8) | 裁剪后--build_tailor=ON | 912K |feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax|
+| mobilenetv1(armv7) | 裁剪前--build_tailor=OFF | 938K |feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax|
+| mobilenetv1(armv7) | 裁剪后--build_tailor=ON | 607K |feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax|
+| mobilenetv2(armv7) | 裁剪前--build_tailor=OFF | 938K | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
+| mobilenetv2(armv7) | 裁剪后--build_tailor=ON |687K |feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax|
+
+
+
+
+## 实现过程
+
+
+### 1、转化模型时记录优化后模型信息
+
+说明:使用model_optimize_tool转化模型时,选择 `--record_tailoring_info=true` 会将优化后模型的OP和kernel信息保存到输出文件夹,这些信息将用于编译裁剪后的动态库。
+注意:需要使用Paddle-Lite 最新版本(release/v2.0.0之后)代码编译出的model_optimize_tool。
+例如:
+
+```bash
+./model_optimize_tool --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1NB --record_tailoring_info=true --valid_targets=arm
+```
+效果:优化后模型使用的OP和kernel信息被保存在 `mobilenet_v1NB`文件夹下的隐藏文件中。
+
+### 2、根据模型信息编译裁剪后的预测库
+
+说明:编译Paddle-Lite时选择`--build_tailor=ON`,并且用 `--opt_model_dir=` 指定优化后模型的路径。
+例如:
+
+```bash
+./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON \
+  --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish
+```
+**注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径
+
+**效果**:编译出来的动态库文件变小,且可以运行优化后的模型。
+
+编译出的C++预测库文件位于:
+
+`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/`
+
+编译出的Java预测库文件位于:
+
+`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/java/so/`
+
+### 3、运行裁剪后的预测库文件
+
+注意:基于某一模型裁剪出的预测库只能支持优化工具转化后的该模型,例如根据mobilenetV1裁剪出的 full_api预测库只能运行以protobuf格式转化出的模型mobilenetV1_opt_nb, 裁剪出的light_api预测库只能运行以naive_buffer格式转化出的模型mobilenetV1_opt_nb, 运行其他模型可能会出现`segementation fault:undifined op or kernel`。模型转化方法参考:[使用opt转化模型](./model_optimize_tool)。
+
+
+
+**示例1**:使用裁剪后的light_api预测库运行mobilenetv1
+
+1、执行第二步编译后,light_api的C++ 示例位于
+
+`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_light`
+
+输入`make`命令执行编译,可编译出可执行文件mobilenetv1_light_api
+
+2、使用adb将mobilenetV1_NB模型和mobilenetv1_light_api传到手机后执行demo:
+
+`./mobilenetv1_light_api --model_dir=./mobilenetV1_NB`
+
+注意:`mobilenetV1_NB`是用`mobilenetV1`模型转化的naive_buffer格式模型(不需要设置`--record_tailoring_info=true`,转化流程参考:[使用opt转化模型](./model_optimize_tool))。
+
+
+
+**示例2**:使用裁剪后的full_api预测库运行mobilenetv1
+
+1、执行第二步编译后,full_api的C++ 示例位于
+
+`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_full`
+
+替换mobilenetv1_full_api.cc代码内容:
+
+```C++
+#include <gflags/gflags.h>
+#include <stdio.h>
+#include <vector>
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set CxxConfig
+  CxxConfig config;
+  config.set_model_file(FLAGS_model_dir + "model");
+  config.set_param_file(FLAGS_model_dir + "params");
+
+  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+  config.set_valid_places(valid_places);
+
+  // 2. Create PaddlePredictor by CxxConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<CxxConfig>(config);
+
+  // 3. Prepare input data
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 3, 224, 224}));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr<Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+  }
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
+
+```
+
+2、使用adb将mobilenetV1_PB模型和mobilenetv1_full_api传到手机后执行demo:
+
+`./mobilenetv1_full_api --model_dir=./mobilenetV1_PB`
+
+注意:`mobilenetV1_PB`是用`mobilenetV1`模型转化的protobuf格式模型(不需要设置`--record_tailoring_info=true`,转化流程参考:[使用opt转化模型](./model_optimize_tool))。
+
+## 按模型集合裁剪预测库
+
+为了方便用户使用,我们同时提供了按模型集合进行预测库裁剪的功能。用户可以提供一个模型集合,Model Optimize Tool会根据用户所指定的模型集合,分析其**优化后的**模型所需要的算子信息,并对预测库进行裁剪。借助此功能,用户可以根据自身需要,使用模型集合对预测库中的算子进行任意裁剪。
+
+使用方法如下所示:
+
+```shell
+# 非combined模型集合
+./model_optimize_tool \
+    --model_set_dir=<模型集合路径> \
+    --optimize_out_type=naive_buffer \
+    --optimize_out=<优化模型输出路径> \
+    --record_tailoring_info=true \
+    --valid_targets=arm
+
+# combined模型集合
+./model_optimize_tool \
+    --model_set_dir=<模型集合路径> \
+    --optimize_out_type=naive_buffer \
+    --model_filename=<模型拓扑文件名> \
+    --param_filename=<模型参数文件名> \
+    --optimize_out=<优化模型输出路径> \
+    --record_tailoring_info=true \
+    --valid_targets=arm
+```
+
+经过以上步骤后,会在`<优化模型输出路径>`中生成模型集合中各模型对应的NaiveBuffer格式的优化模型。此步骤会对模型集合中所需算子信息进行搜集,并存储到`<优化模型输出路径>`中。下一步编译预测库的流程与使用单模型进行预测库裁剪的步骤相同。
+
+**注意:**
+
+1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。
+2. 使用非combined参数模型时,模型拓扑文件名应为`__model__`;使用combined参数模型时,集合中各模型的拓扑与参数文件名应相同,分别由`--model_filename`和`--param_filename`指定。
+3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。
+4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。
diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md
new file mode 100644
index 0000000000000000000000000000000000000000..fccc6d8b23c78474257d11399d121816f57fc422
--- /dev/null
+++ b/docs/user_guides/model_optimize_tool.md
@@ -0,0 +1,161 @@
+
+# 模型转化方法
+
+Lite架构在预测过程中表现出来的高性能得益于其丰富的优化组件,其中包括量化、子图融合、混合调度、Kernel优选等策略。为了使优化过程更加方便易用,我们提供了**opt**来自动完成优化步骤,输出一个轻量的、最优的可执行模型。具体使用方法介绍如下:
+
+**注意**:release/v2.2.0之前的模型转化工具名称为`model_optimize_tool`,从release/v2.3开始模型转化工具名称修改为`opt`
+
+## 准备opt
+当前获得opt的方法有三种:
+
+1. 我们提供当前develop分支编译结果下载:[opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt)、[opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac)
+release/v2.2.0之前版本的model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool)、[model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac)
+
+2. 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt`
+   (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt)
+
+3. 可以下载Paddle-Lite源码,从源码编译出opt工具
+```bash
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+git checkout 
+./lite/tools/build.sh build_optimize_tool
+```
+编译结果位于`Paddle-Lite/build.opt/lite/api/opt`。
+**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](../installation/source_compile)。
+
+## 使用opt
+
+opt是x86平台上的可执行文件,需要在PC端运行,包括Linux终端和Mac终端。
+
+### 帮助信息
+执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项:
+```bash
+./opt
+```
+![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)
+
+### 功能一:转化模型为Paddle-Lite格式
+opt可以将PaddlePaddle支持的模型转化为Paddle-Lite支持的模型格式,期间执行的操作包括:将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积;执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等性能指标。
+
+模型优化过程:
+
+(1)准备待优化的PaddlePaddle模型
+
+PaddlePaddle模型有两种保存格式:
+   Combined Param:所有参数信息保存在单个文件`params`中,模型的拓扑信息保存在`__model__`文件中。
+
+![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png)
+
+   Separated Param:参数信息分开保存在多个参数文件中,模型的拓扑信息保存在`__model__`文件中。
+![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png)
+
+(2) 终端中执行`opt`优化模型
+**使用示例**:转化`mobilenet_v1`模型
+
+```
+./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt
+```
+以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示:
+
+![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png)
+
+
+(3) **更详尽的转化命令**总结:
+
+```shell
+./opt \
+    --model_dir=<model_param_dir> \
+    --model_file=<model_path> \
+    --param_file=<param_path> \
+    --optimize_out_type=(protobuf|naive_buffer) \
+    --optimize_out=<output_optimize_model_dir> \
+    --valid_targets=(arm|opencl|x86|npu|xpu) \
+    --prefer_int8_kernel=(true|false) \
+    --record_tailoring_info=(true|false)
+```
+
+| 选项         | 说明 |
+| ------------------- | ------------------------------------------------------------ |
+| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 |
+| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径。 |
+| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径。 |
+| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 |
+| --optimize_out | 优化模型的输出路径。 |
+| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 |
+| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 |
+| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 |
+
+* 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。
+* 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。
+* 优化后的模型包括__model__.nb和param.nb文件。
+
+### 功能二:统计模型算子信息、判断是否支持
+
+opt可以统计并打印出model中的算子信息,判断Paddle-Lite是否支持该模型,并可以打印出当前Paddle-Lite的算子支持情况。
+
+(1)使用opt统计模型中算子信息
+
+下面命令可以打印出mobilenet_v1模型中包含的所有算子,并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型:
+
+`./opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm`
+
+![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png)
+
+(2)使用opt打印当前Paddle-Lite支持的算子信息
+
+`./opt --print_all_ops=true`
+
+以上命令可以打印出当前Paddle-Lite支持的所有算子信息,包括OP的数量和每个OP支持哪些硬件平台:
+
+![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png)
+
+`./opt --print_supported_ops=true --valid_targets=x86`
+
+以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP:
+
+![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png)
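+
+转化得到的 `.nb` 模型随后可在端侧用 light_api 加载运行。下面是一段极简的 C++ 示意(非完整实现,模型路径为假设值),其中 `set_model_from_file` 即本文档其他部分提到的单文件模型加载接口:
+
+```c++
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+int main() {
+  MobileConfig config;
+  // 加载功能一产出的 naive_buffer 模型(假设的路径)
+  config.set_model_from_file("./mobilenet_v1_opt.nb");
+  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
+  // ... 设置输入后调用 predictor->Run() 即可 ...
+  return 0;
+}
+```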
+
+## 其他功能:合并x2paddle和opt的一键脚本
+
+**背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化:先使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Paddle-Lite可支持的格式。
+为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并:
+
+**一键转化脚本**:[auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh)
+
+
+**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和其环境依赖项。
+
+**使用方法**:
+
+(1)打印帮助信息:`./auto_transform.sh`
+
+(2)转化模型方法
+
+```bash
+USAGE:
+    auto_transform.sh combines the function of x2paddle and opt, it can
+    transform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form.
+----------------------------------------
+example:
+    ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result
+----------------------------------------
+Arguments about x2paddle:
+    --framework=(tensorflow|caffe|onnx);
+    --model='model file for tensorflow or onnx';
+    --prototxt='proto file for caffe' --weight='weight file for caffe'
+ For TensorFlow:
+   --framework=tensorflow --model=tf_model.pb
+
+ For Caffe:
+   --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel
+
+ For ONNX
+   --framework=onnx --model=onnx_model.onnx
+
+Arguments about opt:
+    --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite.
+    --fluid_save_dir='path to output model after x2paddle'
+    --optimize_out='path to output Paddle-Lite model'
+----------------------------------------
+```
diff --git a/docs/user_guides/opencl.md b/docs/user_guides/opencl.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9533af1ff6e2447a8e4d389df90cdb457f58fb2
--- /dev/null
+++ b/docs/user_guides/opencl.md
@@ -0,0 +1,242 @@
+# Lite基于OpenCL的ARM GPU预测
+
+Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。
+
+## 编译
+
+### 编译环境
+
+1. Docker 容器环境;
+2. Linux(推荐 Ubuntu 16.04)环境。
+
+详见 **源码编译指南-环境准备** 章节。
+
+### 编译选项
+
+|参数|介绍|值|
+|--------|--------|--------|
+|--arm_os|代表目标操作系统|目前仅支持且默认为`android`|
+|--arm_abi|代表体系结构类型,支持armv8和armv7|默认为`armv8`即arm64-v8a;`armv7`即armeabi-v7a|
+|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc,支持 gcc和clang两种|
+
+### 编译Paddle-Lite OpenCL库范例
+
+注:以android-armv8-opencl为目标、Docker容器为编译开发环境,CMake 3.10,android-ndk-r17c位于`/opt/`目录下。
+
+```bash
+# 假设当前处于Lite源码根目录下
+
+# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同
+export NDK_ROOT=/opt/android-ndk-r17c
+
+# 删除上一次CMake自动生成的.h文件
+rm ./lite/api/paddle_use_kernels.h
+rm ./lite/api/paddle_use_ops.h
+
+# 根据指定编译参数编译
+./lite/tools/ci_build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  build_test_arm_opencl
+```
+
+编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物:
+
+- `cxx`:该目录是编译目标的C++的头文件和库文件;
+- `demo`:该目录包含了两个demo,用来调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文件夹下执行`make`:
+  - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见代码注释;
+  - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型;
+- `opencl`:该目录存放opencl实现的相关kernel。
+
+```bash
+.
+|-- cxx +| |-- include +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib +| |-- libpaddle_api_full_bundled.a +| |-- libpaddle_api_light_bundled.a +| |-- libpaddle_full_api_shared.so +| `-- libpaddle_light_api_shared.so +|-- demo +| `-- cxx +| |-- Makefile.def +| |-- README.md +| |-- include +| | |-- paddle_api.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| |-- mobile_full +| | |-- Makefile +| | `-- mobilenetv1_full_api.cc +| `-- mobile_light +| |-- Makefile +| `-- mobilenetv1_light_api.cc +`-- opencl + `-- cl_kernel + |-- buffer + | |-- depthwise_conv2d_kernel.cl + | |-- elementwise_add_kernel.cl + | |-- fc_kernel.cl + | |-- im2col_kernel.cl + | |-- layout_kernel.cl + | |-- mat_mul_kernel.cl + | |-- pool_kernel.cl + | `-- relu_kernel.cl + |-- cl_common.h + `-- image + |-- channel_add_kernel.cl + |-- elementwise_add_kernel.cl + |-- pool_kernel.cl + `-- relu_kernel.cl +``` + +调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 + + + +## 运行示例 + +下面以android、ARMv8、gcc的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 + + +**注意:** 以下命令均在Lite源码根目录下运行。在3个示例前,下面这段命令都先要执行用来准备环境: + +```bash +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl +adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer +adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image + +# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下 +adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/ +adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/ +adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/ +``` + +### 运行示例1: 编译产物demo示例 + +```bash +###################################################################### +# 编译mobile_full的demo # +###################################################################### +# 步骤: # +# 0.确保编译Paddle-Lite时编译了OpenCL; # +# 1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏; # +# 2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo; # +# 3.上传demo, 模型, opencl kernel文件到手机; # +# 4.运行demo得到预期结果. # +###################################################################### +adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 +chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api +adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/ +adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 + +# use mobile_full run mobilenet_v1 +# `GLOG_v` is log level +adb shell "export GLOG_v=0; \ + /data/local/tmp/opencl/mobilenetv1_full_api \ + --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ + --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model" + + + +###################################################################### +# 编译mobile_light的demo # +###################################################################### +# 步骤: # +# 0.确保编译Paddle-Lite时编译了OpenCL; # +# 1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`; # +# 2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo; # +# 3.上传demo, 模型, opencl kernel文件到手机; # +# 4.运行demo得到预期结果. 
#
+######################################################################
+
+# use model_optimize_tool to optimize model
+./build.model_optimize_tool/lite/api/model_optimize_tool \
+  --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
+  --optimize_out_type=naive_buffer \
+  --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
+  --valid_targets=opencl
+
+adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
+chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api
+adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/
+adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
+
+# use mobile_light run mobilenet_v1
+adb shell "export GLOG_v=5; \
+   /data/local/tmp/opencl/mobilenetv1_light_api \
+   --model_dir=/data/local/tmp/opencl/mobilenet_v1"
+```
+
+### 运行示例2: test_mobilenetv1单元测试
+
+- **运行文件准备**
+
+```bash
+# 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下
+adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1
+adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/
+
+# 将OpenCL单元测试程序test_mobilenetv1,推送到/data/local/tmp/opencl目录下
+adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/local/tmp/opencl
+```
+
+- **执行OpenCL推理过程**
+
+使用如下命令运行OpenCL程序。其中:
+
+- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录;
+- `--model_dir`指定了模型文件所在目录。
+
+```bash
+adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1
+
+adb shell /data/local/tmp/opencl/test_mobilenetv1 \
+  --cl_path=/data/local/tmp/opencl \
+  --model_dir=/data/local/tmp/opencl/mobilenet_v1 \
+  --warmup=1 \
+  --repeats=1
+```
+
+**注意:** 因为权重参数均会在Op Kernel第一次运行时进行加载,所以第一次的执行时间会略长。一般将warmup的值设为1,repeats值设为多次。
+
+### 运行示例3: test_layout_opencl单元测试
+
+- **运行文件准备**
+
+```bash
+# 将OpenCL单元测试程序test_layout_opencl,推送到/data/local/tmp/opencl目录下
+adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/
+```
+
+- **执行OpenCL推理过程**
+
+```bash
+adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl
+adb shell /data/local/tmp/opencl/test_layout_opencl
+```
+
+
+## 如何在Code中使用
+
+见运行示例1的demo代码:
+
+1. [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc);
+2. [./lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc).
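+
+在查阅上述完整示例之前,可以先看下面这段基于 light_api 的极简调用示意(非完整实现;模型路径为假设值,假定模型已用 `--valid_targets=opencl` 优化为 naive_buffer 格式):
+
+```c++
+#include <stdio.h>
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+int main() {
+  // 加载经 model_optimize_tool 优化后的模型(假设的路径)
+  MobileConfig config;
+  config.set_model_dir("/data/local/tmp/opencl/mobilenet_v1");
+  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+  // 构造 1x3x224x224 的全 1 输入,仅作演示
+  auto input = predictor->GetInput(0);
+  input->Resize({1, 3, 224, 224});
+  auto* data = input->mutable_data<float>();
+  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.f;
+
+  predictor->Run();
+
+  // 读取输出;OpenCL 与 ARM CPU Kernel 的选取由框架完成,对调用方透明
+  auto output = predictor->GetOutput(0);
+  printf("Output[0]: %f\n", output->data<float>()[0]);
+  return 0;
+}
+```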
+ +注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 + +**NOTE:** 对OpenCL的支持还在持续开发中。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 61f07583b2ed920ce7ac0f2d56b2b2e89bb99b42..bac6f80c4721e0c5de201eebfe7e6a39a0bdc73a 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -5,9 +5,11 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -65,6 +67,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) + if (LITE_WITH_BM) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") + endif(LITE_WITH_BM) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -160,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -171,11 +176,17 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) + add_dependencies(tiny_publish_cxx_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_cxx_lib) + if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) + endif() endif() endif() endif() @@ -213,7 +224,16 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" 
"${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -225,6 +245,16 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" ) 
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 63d53869ea530212ea03b24ef746d980fd13a19b..f7f74ab5822a1305e3e8d24cf36a0a458a6494ff 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -16,32 +16,40 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) - add_dependencies(paddle_full_api_shared xxhash) - target_link_libraries(paddle_full_api_shared xxhash) + add_dependencies(paddle_full_api_shared xxhash) + target_link_libraries(paddle_full_api_shared xxhash) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + add_dependencies(paddle_full_api_shared dynload_mklml) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") - endif(LITE_WITH_CUDA) + endif(LITE_WITH_CUDA) + #light api dynamic library lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels}) + SRCS light_api_shared.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels}) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - if (LITE_WITH_NPU) - # Strips the symbols of our protobuf functions to fix the conflicts during - # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) 
+ add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency - target_link_libraries(paddle_light_api_shared ${npu_runtime_libs}) + target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() endif() endif() @@ -52,13 +60,19 @@ if (WITH_TESTING) ${ops} ${host_kernels} CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} - XPU_DEPS ${xpu_kernels}) + XPU_DEPS ${xpu_kernels} + BM_DEPS ${bm_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) endif() +if(LITE_WITH_BM) + set(light_api_deps ${light_api_deps} ${bm_deps}) + set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) +endif() + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -67,29 +81,32 @@ message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") +message(STATUS "get BM kernels ${bm_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) set(cxx_api_deps - scope optimizer target_wrapper_host model_parser program) + scope optimizer target_wrapper_host model_parser program) lite_cc_library(cxx_api - SRCS cxx_api.cc - DEPS ${cxx_api_deps} ${ops} ${host_kernels} program - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass - XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + SRCS cxx_api.cc + DEPS ${cxx_api_deps} ${ops} ${host_kernels} program + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + BM_DEPS ${bm_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) endif() # for light api set(light_api_deps scope target_wrapper_host model_parser program) if(LITE_WITH_CUDA) + get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) set(light_api_deps ${light_api_deps} target_wrapper_cuda) - set(cuda_static_deps cudart_static cublas_static curand_static - cudnn_static culibos_static) endif() lite_cc_library(light_api SRCS light_api.cc DEPS scope target_wrapper_host model_parser @@ -97,10 +114,12 @@ lite_cc_library(light_api SRCS light_api.cc CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -111,11 +130,14 @@ if(WITH_TESTING) DEPS cxx_api mir_passes lite_api_test_helper ${ops} ${host_kernels} X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS 
paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -151,6 +173,12 @@ if(WITH_TESTING) ${ops} ${host_kernels} ${x86_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + if(LITE_WITH_BM) + lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + endif() endif() endif() @@ -221,16 +249,18 @@ else() endif() if (NOT LITE_ON_TINY_PUBLISH) lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + ${ops} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) + target_link_libraries(paddle_api_full ${cuda_deps}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) - cc_library(api_full_static SRCS DEPS paddle_api_full cxx_api paddle_api light_api ${cxx_api_deps} ${ops} ${host_kernels} ${cuda_kernels} program tensor memory naive_buffer types ${fluid_modules} protobuf ${cuda_static_deps}) endif() + bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) #----------------------------------------------------------------------------------------------------- @@ -240,6 +270,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) lite_cc_test(test_apis SRCS apis_test.cc @@ -248,6 +279,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -255,7 +287,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM) add_subdirectory(android) endif() -if (LITE_WITH_PYTHON) +if (LITE_WITH_PYTHON) add_subdirectory(python) endif() @@ -264,20 +296,22 @@ if (LITE_ON_TINY_PUBLISH) endif() if (LITE_ON_MODEL_OPTIMIZE_TOOL) - message(STATUS "Compiling model_optimize_tool") - lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc + message(STATUS "Compiling opt") + lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) + add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light ${ops} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS 
--model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -285,25 +319,39 @@ endif() # Some bins if(NOT IOS) - lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) - lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h index bdccfab5df67e485b9fef110dc6cc1e9d74b21c3..6da47e53789d651f4a36d0b8d6a7ca1ea5a0a3d3 100644 --- a/lite/api/_paddle_use_ops.h +++ b/lite/api/_paddle_use_ops.h @@ -108,7 +108,7 @@ USE_LITE_OP(while) USE_LITE_OP(lod_reset) USE_LITE_OP(lookup_table) USE_LITE_OP(multiclass_nms) -USE_LITE_OP(graph_op) +USE_LITE_OP(subgraph) USE_LITE_OP(sequence_expand) USE_LITE_OP(sequence_pool) USE_LITE_OP(reduce_max) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index 3efa980332f25d786d5c880fab9b3ba5af0a1013..c1766772f8aaa417c3da1d72f2692c10c10194b4 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -25,11 +25,12 @@ if (NOT LITE_ON_TINY_PUBLISH) endif() else() add_library(paddle_lite_jni SHARED "") + set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency - target_link_libraries(paddle_lite_jni ${npu_runtime_libs}) + target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) endif() endif() diff --git a/lite/api/android/jni/native/convert_util_jni.h b/lite/api/android/jni/native/convert_util_jni.h index 5e5d3723e43eb311f64b85f7507a12497d724109..e4adafdc572fdc937f568508aa9d43eb78470d0d 100644 --- 
a/lite/api/android/jni/native/convert_util_jni.h +++ b/lite/api/android/jni/native/convert_util_jni.h @@ -181,6 +181,7 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env, MobileConfig config; // set model dir + // NOTE: This is a deprecated API and will be removed in latter release. jmethodID model_dir_method = env->GetMethodID( mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); jstring java_model_dir = @@ -190,6 +191,27 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env, config.set_model_dir(cpp_model_dir); } + // set model from file + jmethodID model_file_method = env->GetMethodID( + mobileconfig_jclazz, "getModelFromFile", "()Ljava/lang/String;"); + jstring java_model_file = + (jstring)env->CallObjectMethod(jmobileconfig, model_file_method); + if (java_model_file != nullptr) { + std::string cpp_model_file = jstring_to_cpp_string(env, java_model_file); + config.set_model_from_file(cpp_model_file); + } + + // set model from buffer + jmethodID model_buffer_method = env->GetMethodID( + mobileconfig_jclazz, "getModelFromBuffer", "()Ljava/lang/String;"); + jstring java_model_buffer = + (jstring)env->CallObjectMethod(jmobileconfig, model_buffer_method); + if (java_model_buffer != nullptr) { + std::string cpp_model_buffer = + jstring_to_cpp_string(env, java_model_buffer); + config.set_model_from_buffer(cpp_model_buffer); + } + // set threads jmethodID threads_method = env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I"); diff --git a/lite/api/android/jni/native/tensor_jni.cc b/lite/api/android/jni/native/tensor_jni.cc index 59cafa19399c4d265915e2dac8653e9ed7d10851..5212fe9a6eba2b034883da93c9ea5d845a63c773 100644 --- a/lite/api/android/jni/native/tensor_jni.cc +++ b/lite/api/android/jni/native/tensor_jni.cc @@ -120,6 +120,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( return JNI_TRUE; } +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I( + JNIEnv *env, jobject jtensor, jintArray buf) { + std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); + if (tensor == nullptr || (*tensor == nullptr)) { + return JNI_FALSE; + } + int64_t buf_size = (int64_t)env->GetArrayLength(buf); + if (buf_size != product((*tensor)->shape())) { + return JNI_FALSE; + } + + int32_t *input = (*tensor)->mutable_data(); + env->GetIntArrayRegion(buf, 0, buf_size, input); + return JNI_TRUE; +} + JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) { if (is_const_tensor(env, jtensor)) { @@ -148,6 +164,20 @@ Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) { } } +JNIEXPORT jintArray JNICALL +Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *env, jobject jtensor) { + if (is_const_tensor(env, jtensor)) { + std::unique_ptr *tensor = + get_read_only_tensor_pointer(env, jtensor); + return cpp_array_to_jintarray( + env, (*tensor)->data(), product((*tensor)->shape())); + } else { + std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); + return cpp_array_to_jintarray( + env, (*tensor)->data(), product((*tensor)->shape())); + } +} + JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor( JNIEnv *env, jobject jtensor, jlong java_pointer) { if (java_pointer == 0) { diff --git a/lite/api/android/jni/native/tensor_jni.h b/lite/api/android/jni/native/tensor_jni.h index 34c35b6a76f777895dbe88dc5eadf48c659ee544..9b029dfb4c7431354d5de20c6132236764c6cc66 100644 --- 
a/lite/api/android/jni/native/tensor_jni.h +++ b/lite/api/android/jni/native/tensor_jni.h @@ -16,8 +16,8 @@ #include /* Header for class com_baidu_paddle_lite_Tensor */ -#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ -#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ +#ifndef LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ +#define LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ #ifdef __cplusplus extern "C" { #endif @@ -49,6 +49,14 @@ Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject); JNIEXPORT jbyteArray JNICALL Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject); +/* + * Class: com_baidu_paddle_lite_Tensor + * Method: getIntData + * Signature: ()[I + */ +JNIEXPORT jintArray JNICALL +Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *, jobject); + /* * Class: com_baidu_paddle_lite_Tensor * Method: nativeResize @@ -73,6 +81,14 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F( JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( JNIEnv *, jobject, jbyteArray); +/* + * Class: com_baidu_paddle_lite_Tensor + * Method: nativeSetData + * Signature: ([I)Z + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I( + JNIEnv *, jobject, jintArray); + /* * Class: com_baidu_paddle_lite_Tensor * Method: deleteCppTensor @@ -87,4 +103,4 @@ Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong); #ifdef __cplusplus } #endif -#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ +#endif // LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java index 5c71db0c92b344e44ea2927305580de1be293f75..e150f98f22113ef6bcedd5e9882e0bd2a6378c97 100644 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java +++ b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java @@ -64,6 +64,44 @@ public class MobileConfig extends ConfigBase { return powerMode.value(); } + /** + * Set model from file. + * + * @return + */ + public void setModelFromFile(String modelFile) { + this.liteModelFile = modelFile; + } + + /** + * Returns name of model_file. + * + * @return liteModelFile + */ + public String getModelFile() { + return liteModelFile; + } + + /** + * Set model from buffer. + * + * @return + */ + public void setModelFromBuffer(String modelBuffer) { + this.liteModelBuffer = modelBuffer; + } + + /** + * Returns model buffer + * + * @return liteModelBuffer + */ + public String getModelBuffer() { + return liteModelBuffer; + } + private PowerMode powerMode = PowerMode.LITE_POWER_HIGH; private int threads = 1; + private String liteModelFile; + private String liteModelBuffer; } diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java index ac78800bd2e4903b44332a0a0aefe9c69b75abab..f76841dd413ddda86678eecf8241068dd98b74a4 100644 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java +++ b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java @@ -108,6 +108,19 @@ public class Tensor { return nativeSetData(buf); } + /** + * Set the tensor int data. + * + * @param buf the int array buffer which will be copied into tensor. + * @return true if set data successfully. + */ + public boolean setData(int[] buf) { + if (readOnly) { + return false; + } + return nativeSetData(buf); + } + /** * @return shape of the tensor as long array. 
      */
@@ -123,12 +136,19 @@ public class Tensor {
      */
     public native byte[] getByteData();
 
+    /**
+     * @return the tensor data as int array.
+     */
+    public native int[] getIntData();
+
     private native boolean nativeResize(long[] dims);
 
     private native boolean nativeSetData(float[] buf);
 
     private native boolean nativeSetData(byte[] buf);
 
+    private native boolean nativeSetData(int[] buf);
+
    /**
     * Delete the C++ Tensor object pointed to by the input pointer, which is
     * presented as a long value.
diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc
index ac2c385d53ea0a1785393cd488d115d20c4264f1..bb852297d11a8862460ed6f12e007d727aca9428 100644
--- a/lite/api/apis_test.cc
+++ b/lite/api/apis_test.cc
@@ -62,7 +62,7 @@ TEST(CXXApi_LightApi, optim_model) {
 
 TEST(CXXApi_LightApi, save_and_load_model) {
   lite::Predictor cxx_api;
-  lite::LightPredictor light_api(FLAGS_optimized_model);
+  lite::LightPredictor light_api(FLAGS_optimized_model + ".nb", false);
 
   // CXXAPi
   {
diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc
index 462a5e2381acf3cc86ca81002a282933f01ee049..718dbe44296f2d197efc5b567cf0cc211835d176 100644
--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
@@ -13,40 +13,82 @@
 // limitations under the License.
 
 #include
+#include
+#include
+#include
 #include
 #include
+#include
+#include
 #include
 #include
 #include "lite/api/paddle_api.h"
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
-#include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 
+DEFINE_string(model_dir,
+              "",
+              "the path of the model; set model_dir when the model is in "
+              "non-combined format. This option will be ignored if model_file "
+              "and param_file exist.");
+DEFINE_string(model_file,
+              "",
+              "the path of the model file; set model_file when the model is "
+              "in combined format.");
+DEFINE_string(param_file,
+              "",
+              "the path of the param file; set param_file when the model is "
+              "in combined format.");
 DEFINE_string(input_shape,
               "1,3,224,224",
-              "input shapes, separated by colon and comma");
-DEFINE_string(result_filename, "", "save test result");
+              "set input shapes according to the model, "
+              "separated by colon and comma, "
+              "such as 1,3,224,224:1,3,300,300.");
+DEFINE_int32(warmup, 0, "warmup times");
+DEFINE_int32(repeats, 1, "repeat times");
+DEFINE_int32(power_mode,
+             3,
+             "arm power mode: "
+             "0 for big cluster, "
+             "1 for little cluster, "
+             "2 for all cores, "
+             "3 for no bind");
+DEFINE_int32(threads, 1, "threads num");
+DEFINE_string(result_filename,
+              "result.txt",
+              "save benchmark "
+              "result to the file");
 DEFINE_bool(run_model_optimize,
             false,
-            "if set true, apply model_optimize_tool to model, use optimized "
-            "model to test");
-DEFINE_bool(is_quantized_model, false, "if set true, test the quantized model");
+            "if set true, apply model_optimize_tool to the "
+            "model and use the optimized model to test.");
+DEFINE_bool(is_quantized_model,
+            false,
+            "if set true, "
+            "test the performance of the quantized model.");
 
 namespace paddle {
 namespace lite_api {
 
-void OutputOptModel(const std::string& load_model_dir,
-                    const std::string& save_optimized_model_dir,
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+void OutputOptModel(const std::string& save_optimized_model_dir,
                     const std::vector<std::vector<int64_t>>& input_shapes) {
   lite_api::CxxConfig config;
-  config.set_model_dir(load_model_dir);
-  std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)},
-                                     Place{TARGET(kX86), PRECISION(kFloat)},
-                                     Place{TARGET(kOpenCL), PRECISION(kFloat)}};
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_model_file(FLAGS_model_file);
+  config.set_param_file(FLAGS_param_file);
+  std::vector<Place> vaild_places = {
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  };
   if (FLAGS_is_quantized_model) {
     vaild_places.insert(vaild_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
@@ -58,34 +100,33 @@ void OutputOptModel(const std::string& load_model_dir,
       paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
          .c_str());
   if (ret == 0) {
-    LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
+    LOG(INFO) << "Delete old optimized model " << save_optimized_model_dir;
   }
   predictor->SaveOptimizedModel(save_optimized_model_dir,
                                 LiteModelType::kNaiveBuffer);
-  LOG(INFO) << "Load model from " << load_model_dir;
+  LOG(INFO) << "Load model from " << FLAGS_model_dir;
   LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
 }
 
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 void Run(const std::vector<std::vector<int64_t>>& input_shapes,
          const std::string& model_dir,
-         const int repeat,
-         const int thread_num,
-         const int warmup_times,
          const std::string model_name) {
+  // set config and create predictor
   lite_api::MobileConfig config;
-  config.set_threads(thread_num);
-  config.set_power_mode(LITE_POWER_NO_BIND);
-  config.set_model_dir(model_dir);
+  config.set_threads(FLAGS_threads);
+  config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
+  config.set_model_from_file(model_dir + ".nb");
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
+  // set input
   for (int j = 0; j < input_shapes.size(); ++j) {
     auto input_tensor = predictor->GetInput(j);
     input_tensor->Resize(input_shapes[j]);
     auto input_data = input_tensor->mutable_data<float>();
     int input_num = 1;
-    for (int i = 0; i < input_shapes[j].size(); ++i) {
+    for (size_t i = 0; i < input_shapes[j].size(); ++i) {
       input_num *= input_shapes[j][i];
     }
     for (int i = 0; i < input_num; ++i) {
@@ -93,26 +134,37 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
     }
   }
 
-  for (int i = 0; i < warmup_times; ++i) {
+  // warmup
+  for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor->Run();
   }
 
-  auto start = lite::GetCurrentUS();
-  for (int i = 0; i < repeat; ++i) {
+  // run
+  std::vector<float> perf_vct;
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    auto start = GetCurrentUS();
     predictor->Run();
+    auto end = GetCurrentUS();
+    perf_vct.push_back((end - start) / 1000.0);
   }
-  auto end = lite::GetCurrentUS();
-
-  std::FILE* pf = std::fopen(FLAGS_result_filename.c_str(), "a");
-  if (nullptr == pf) {
-    LOG(INFO) << "create result file error";
-    exit(0);
+  std::sort(perf_vct.begin(), perf_vct.end());
+  float min_res = perf_vct.front();
+  float max_res = perf_vct.back();
+  float total_res = accumulate(perf_vct.begin(), perf_vct.end(), 0.0);
+  float avg_res = total_res / FLAGS_repeats;
+
+  // save result
+  std::ofstream ofs(FLAGS_result_filename, std::ios::app);
+  if (!ofs.is_open()) {
+    LOG(FATAL) << "open result file failed";
   }
-  fprintf(pf,
-          "-- %-18s avg = %5.4f ms\n",
-          model_name.c_str(),
-          (end - start) / repeat / 1000.0);
-  std::fclose(pf);
+  ofs.precision(5);
+  ofs << std::setw(30) << std::fixed << std::left << model_name;
+  ofs << "min = " << std::setw(12) << min_res;
+  ofs << "max = " << std::setw(12) << max_res;
+  ofs << "average = " << std::setw(12) << avg_res;
+  ofs << std::endl;
+  ofs.close();
 }
 #endif
@@ -122,9 +174,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
-    LOG(INFO) << "usage: "
-              << "--model_dir /path/to/your/model --result_filename "
-                 "/path/to/resultfile";
+    LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
     exit(0);
   }
 
@@ -166,26 +216,20 @@
   std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
   std::vector<std::vector<int64_t>> input_shapes;
-  for (int i = 0; i < str_input_shapes.size(); ++i) {
+  for (size_t i = 0; i < str_input_shapes.size(); ++i) {
     input_shapes.push_back(get_shape(str_input_shapes[i]));
   }
 
-  // Output optimized model
+  // Output optimized model if needed
   if (FLAGS_run_model_optimize) {
-    paddle::lite_api::OutputOptModel(
-        FLAGS_model_dir, save_optimized_model_dir, input_shapes);
+    paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes);
   }
 
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
   // Run inference using optimized model
   std::string run_model_dir =
       FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
-  paddle::lite_api::Run(input_shapes,
-                        run_model_dir,
-                        FLAGS_repeats,
-                        FLAGS_threads,
-                        FLAGS_warmup,
-                        model_name);
+  paddle::lite_api::Run(input_shapes, run_model_dir, model_name);
 #endif
   return 0;
 }
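Editor's note: the rewritten benchmark above times each repeat individually and derives its summary statistics from the sorted latency vector, so after an ascending sort `front()` is the minimum and `back()` the maximum. A self-contained sketch of that reporting pattern, with invented sample latencies:

```cpp
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical per-repeat latencies in milliseconds, one entry per run.
  std::vector<float> perf_vct = {12.4f, 11.8f, 13.1f, 12.0f};
  std::sort(perf_vct.begin(), perf_vct.end());
  float min_res = perf_vct.front();  // smallest value after ascending sort
  float max_res = perf_vct.back();   // largest value
  float avg_res =
      std::accumulate(perf_vct.begin(), perf_vct.end(), 0.0f) /
      static_cast<float>(perf_vct.size());
  std::cout << "min = " << min_res << ", max = " << max_res
            << ", average = " << avg_res << std::endl;
  return 0;
}
```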
diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc
index cbe938cea6e5f84dfb3718585da0880e16cd5bfc..f6f7ec75e65ff54e3f3642822e51057d3522ae3a 100644
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -24,13 +24,6 @@
 namespace paddle {
 namespace lite {
 
-static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
-    ".tailored_ops_source_list";
-static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
-static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
-    ".tailored_kernels_source_list";
-static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
-
 void Predictor::SaveModel(const std::string &dir,
                           lite_api::LiteModelType model_type,
                           bool record_info) {
@@ -50,6 +43,7 @@ void Predictor::SaveModel(const std::string &dir,
     LOG(FATAL) << "Unknown model type";
   }
   if (record_info) {
+    MkDirRecur(dir);
     SaveOpKernelInfo(dir);
   }
 }
@@ -128,6 +122,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
             << kpf_path;
 }
 
+#ifndef LITE_WITH_FPGA
 lite::Tensor *Predictor::GetInput(size_t offset) {
   CHECK(input_names_.size() > offset)
       << "The network has " << input_names_.size() << " inputs"
@@ -137,6 +132,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
       << " in exec_scope";
   return in_var->GetMutable<lite::Tensor>();
 }
+#else
+lite::Tensor *Predictor::GetInput(size_t offset) {
+  auto *_feed_list = exec_scope_->FindVar("feed");
+  CHECK(_feed_list) << "no feed variable in exec_scope";
+  auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
+  if (offset >= feed_list->size()) {
+    feed_list->resize(offset + 1);
+  }
+  return &feed_list->at(offset);
+}
+#endif
 
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
@@ -149,10 +155,10 @@ void Predictor::PrepareFeedFetch() {
   if (!program_) {
     GenRuntimeProgram();
   }
+
   std::vector<const cpp::OpDesc *> feeds;
   std::vector<const cpp::OpDesc *> fetchs;
   const auto &insts = program_->instructions();
-
   for (size_t i = 0; i < program_->num_instructions(); i++) {
     const auto &op = insts[i].op()->op_info();
     if (op->Type() == "feed") {
@@ -174,6 +180,8 @@
     }
   }
 
+#ifndef LITE_WITH_FPGA
+
 const lite::Tensor *Predictor::GetOutput(size_t offset) const {
   CHECK(output_names_.size() > offset)
       << "The network has " << output_names_.size() << " outputs"
@@ -193,6 +201,29 @@
   }
   return outputs;
 }
+#else
+
+const lite::Tensor *Predictor::GetOutput(size_t offset) const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+
+  std::vector<const lite::Tensor *> outputs;
+  for (auto &out : fetch_list) {
+    outputs.push_back(&out);
+  }
+  return outputs;
+}
+
+#endif
 
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
@@ -208,7 +239,11 @@ void Predictor::Build(const lite_api::CxxConfig &config,
   const std::string &model_file = config.model_file();
   const std::string &param_file = config.param_file();
   const bool model_from_memory = config.model_from_memory();
-  LOG(INFO) << "load from memory " << model_from_memory;
+  if (model_from_memory) {
+    LOG(INFO) << "Load model from memory.";
+  } else {
+    LOG(INFO) << "Load model from file.";
+  }
 
   Build(model_path,
         model_file,
@@ -242,7 +277,7 @@
     case lite_api::LiteModelType::kNaiveBuffer:
       CHECK(!model_path.empty())
          << "NaiveBuffer backend only supported combined param";
-      LoadModelNaive(model_path, scope_.get(), &program_desc_);
+      LoadModelNaiveFromFile(model_path, scope_.get(), &program_desc_);
       break;
     default:
       LOG(FATAL) << "Unknown model type";
diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h
index 502ce812e1f4a7f520e89e6eaff020c5853f5308..504710d9fa29420b8762f31e0c675b59c6c626bd 100644
--- a/lite/api/cxx_api.h
+++ b/lite/api/cxx_api.h
@@ -29,6 +29,13 @@
 namespace paddle {
 namespace lite {
 
+static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
+    ".tailored_ops_source_list";
+static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
+static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
+    ".tailored_kernels_source_list";
+static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
+
 /*
  * Predictor for inference, input a model, it will optimize and execute it.
*/ diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 6fa400db6da9f029c38b496cd70d593a876628c9..81ea60eac66849f8ce42fb8cb210226d18bbfa9b 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,6 +20,12 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) +#include +#include "lite/backends/x86/mklml.h" +#endif + namespace paddle { namespace lite { @@ -33,6 +39,17 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { mode_ = config.power_mode(); threads_ = config.threads(); + +#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + int num_threads = config.x86_math_library_num_threads(); + int real_num_threads = num_threads > 1 ? num_threads : 1; + paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(real_num_threads); + VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the " + "number of threads is:" + << num_threads; +#endif } std::unique_ptr CxxPaddleApiImpl::GetInput(int i) { diff --git a/lite/api/cxx_api_test.cc b/lite/api/cxx_api_test.cc index 4d711302cb5880247f4a7b7082185c500b9ad6e9..cdf1e838366f4bcafc1c1c991d8805f115de7345 100644 --- a/lite/api/cxx_api_test.cc +++ b/lite/api/cxx_api_test.cc @@ -101,7 +101,7 @@ TEST(CXXApi, save_model) { TEST(CXXApi, load_model_naive) { lite::Predictor predictor; std::vector valid_places({Place{TARGET(kARM), PRECISION(kFloat)}}); - predictor.Build(FLAGS_optimized_model + ".naive", + predictor.Build(FLAGS_optimized_model + ".naive.nb", "", "", valid_places, diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index a0c4b7e5e375d9d004de63345ba5013ee6c252b9..29d8f4f29ab822f8c9601bbd63a3626abbbf1818 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -18,6 +18,17 @@ namespace paddle { namespace lite { +void LightPredictor::Build(const std::string& lite_model_file, + bool model_from_memory) { + if (model_from_memory) { + LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); + } else { + LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); + } + BuildRuntimeProgram(cpp_program_desc_); + PrepareFeedFetch(); +} + void LightPredictor::Build(const std::string& model_dir, const std::string& model_buffer, const std::string& param_buffer, @@ -41,6 +52,8 @@ void LightPredictor::Build(const std::string& model_dir, default: LOG(FATAL) << "Unknown model type"; } + + DequantizeWeight(); BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -144,5 +157,69 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { program_->set_exec_scope(program.exec_scope()); } +void LightPredictor::DequantizeWeight() { +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < h; ++i) { \ + for (int64_t j = 0; j < w; ++j) { \ + fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ + } \ + } + +#define PROCESS_FC_DATA() \ + for (int i = 0; i < input_tensor->numel(); i++) { \ + *fp_data = scale_list[0] * (*int_data); \ + ++fp_data; \ + ++int_data; \ + } + + Tensor tmp_tensor; + CHECK(cpp_program_desc_.BlocksSize()); + auto* main_block = cpp_program_desc_.GetBlock(0); + for (size_t k = 0; k < main_block->OpsSize(); ++k) { + auto* op_desc = main_block->GetOp(k); + if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op + auto input_names = op_desc->input_vars(); + for (auto& input_name : input_names) { + 
        std::string input_scale_name = input_name + "_quant_scale";
+        if (op_desc->HasAttr(input_scale_name)) {  // the input is quantized
+          auto input_tensor =
+              scope_->FindVar(input_name)->GetMutable<Tensor>();
+          tmp_tensor.CopyDataFrom(*input_tensor);
+          auto scale_list =
+              op_desc->GetAttr<std::vector<float>>(input_scale_name);
+          int quantize_weight_bits =
+              op_desc->GetAttr<int>("quantize_weight_bits");
+          float* fp_data = input_tensor->mutable_data<float>();
+
+          std::string op_type = op_desc->Type();
+          if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
+            int64_t h = input_tensor->dims()[0];
+            int64_t w = input_tensor->numel() / h;
+            CHECK_EQ(scale_list.size(), h);
+            if (quantize_weight_bits == 8) {
+              const int8_t* int_data = tmp_tensor.data<int8_t>();
+              PROCESS_CONV2D_DATA()
+            } else {
+              const int16_t* int_data = tmp_tensor.data<int16_t>();
+              PROCESS_CONV2D_DATA()
+            }
+          } else if (op_type == "fc" || op_type == "mul") {
+            if (quantize_weight_bits == 8) {
+              const int8_t* int_data = tmp_tensor.data<int8_t>();
+              PROCESS_FC_DATA()
+            } else {
+              const int16_t* int_data = tmp_tensor.data<int16_t>();
+              PROCESS_FC_DATA()
+            }
+          }
+        }
+      }
+    }
+  }
+
+#undef PROCESS_CONV2D_DATA
+#undef PROCESS_FC_DATA
+}
+
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
index 3781bc4d674db5d2e8794edaf33f00627b9977bb..aa25ea81c7b62238211f96265a4edc49f2d065a1 100644
--- a/lite/api/light_api.h
+++ b/lite/api/light_api.h
@@ -18,6 +18,7 @@
  */
 
 #pragma once
+#include
 #include
 #include
 #include
@@ -39,12 +40,22 @@ namespace lite {
  */
 class LITE_API LightPredictor {
  public:
-  LightPredictor(
-      const std::string& model_dir,
-      const std::string& model_buffer = "",
-      const std::string& param_buffer = "",
-      bool model_from_memory = false,
-      lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
+  // Constructor of LightPredictor: `lite_model_file` refers to the data in the
+  // model file or buffer, `model_from_memory` refers to whether to load the
+  // model from memory.
+  LightPredictor(const std::string& lite_model_file,
+                 bool model_from_memory = false) {
+    scope_ = std::make_shared<Scope>();
+    Build(lite_model_file, model_from_memory);
+  }
+
+  // NOTE: This is a deprecated API and will be removed in a later release.
+  LightPredictor(const std::string& model_dir,
+                 const std::string& model_buffer = "",
+                 const std::string& param_buffer = "",
+                 bool model_from_memory = false,
+                 lite_api::LiteModelType model_type =
+                     lite_api::LiteModelType::kNaiveBuffer) {
     scope_ = std::make_shared<Scope>();
     Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
   }
@@ -69,6 +80,10 @@ class LITE_API LightPredictor {
   void PrepareFeedFetch();
 
  private:
+  void Build(const std::string& lite_model_file,
+             bool model_from_memory = false);
+
+  // NOTE: This is a deprecated API and will be removed in a later release.
void Build( const std::string& model_dir, const std::string& model_buffer, @@ -78,6 +93,8 @@ class LITE_API LightPredictor { void BuildRuntimeProgram(const cpp::ProgramDesc& prog); + void DequantizeWeight(); + private: std::shared_ptr scope_; std::unique_ptr program_; diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index a0ae28df0958403237114a3d4b94031829019339..3965843250abe45c43490bdbb4aaed58915e0908 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -23,13 +23,17 @@ namespace lite { void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { // LightPredictor Only support NaiveBuffer backend in publish lib - raw_predictor_.reset( - new LightPredictor(config.model_dir(), - config.model_buffer(), - config.param_buffer(), - config.model_from_memory(), - lite_api::LiteModelType::kNaiveBuffer)); - + if (config.lite_model_file().empty()) { + raw_predictor_.reset( + new LightPredictor(config.model_dir(), + config.model_buffer(), + config.param_buffer(), + config.model_from_memory(), + lite_api::LiteModelType::kNaiveBuffer)); + } else { + raw_predictor_.reset(new LightPredictor(config.lite_model_file(), + config.model_from_memory())); + } mode_ = config.power_mode(); threads_ = config.threads(); } diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..addd512eb0039c43edeca562b8f568528aab76f9 --- /dev/null +++ b/lite/api/lite_multithread_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE +#include // NOLINT + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); + +DEFINE_string(model_dir_0, "", "model_dir_0"); +DEFINE_string(input_shape_0, + "1,3,224,224", + "input shapes another, separated by colon and comma"); + +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); + +DEFINE_int32(test_type, 0, "multithread test type"); + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int tid, + const int warmup_times = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + auto output = predictor->GetOutput(0); + auto out = output->data(); + LOG(INFO) << "[thread " << tid << "] Model: " << model_dir + << " output[0]:" << out[0] << "; output[1]:" << out[1]; + } + LOG(INFO) << "[thread " << tid << "] Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num + << ", avg time: " << ti.LapTimes().Avg() << "ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; +} + +void RunTestType_00(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 5) { + std::thread run_th0(Run, + input_shapes, + model_dir, + power_mode, + thread_num, + repeat, + 
0, + warmup_times); + Run(input_shapes, model_dir, power_mode, thread_num, repeat, 1, warmup_times); + run_th0.join(); +} +void RunTestType_01(const std::vector>& input_shapes, + const std::string& model_dir, + const std::vector>& input_shapes_0, + const std::string& model_dir_0, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 5) { + std::thread run_th0(Run, + input_shapes, + model_dir, + power_mode, + thread_num, + repeat, + 0, + warmup_times); + Run(input_shapes_0, + model_dir_0, + power_mode, + thread_num, + repeat, + 1, + warmup_times); + run_th0.join(); +} + +void run_with_predictor(std::shared_ptr predictor, + const std::vector>& input_shapes, + int index, + const std::string& name) { + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + for (int i = 0; i < input_num; ++i) { + input_data[i] = 1.f; + } + } + + Timer ti; + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + + auto output = predictor->GetOutput(0); + auto out = output->data(); + LOG(INFO) << "[thread " << index << "] name: " << name + << ",run time: " << ti.LapTimes().Avg() << "ms" + << " output[0]:" << out[0] << "; output[1]:" << out[1]; +} +void RunTestType_10(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int warmup = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + for (int i = 0; i < repeat; ++i) { + std::thread pre_th0( + run_with_predictor, predictor, input_shapes, i, model_dir); + pre_th0.join(); + } +} +void RunTestType_11(const std::vector>& input_shapes, + const std::string& model_dir, + const std::vector>& input_shapes_0, + const std::string& model_dir_0, + const PowerMode power_mode, + const int thread_num, + const int repeat, + int warmup = 5) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + config.set_model_dir(model_dir_0); + auto predictor_0 = lite_api::CreatePaddlePredictor(config); + + for (int i = 0; i < 2 * repeat; i += 2) { + std::thread pre_th0( + run_with_predictor, predictor, input_shapes, i, model_dir); + std::thread pre_th1( + run_with_predictor, predictor_0, input_shapes_0, i + 1, model_dir_0); + pre_th0.join(); + pre_th1.join(); + } +} + +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + std::string save_optimized_model_dir_0 = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + save_optimized_model_dir_0 = FLAGS_model_dir_0; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + save_optimized_model_dir_0 = FLAGS_model_dir_0 + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) 
{ + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + std::vector str_input_shapes_0 = + split_string(FLAGS_input_shape_0); + std::vector> input_shapes_0; + for (int i = 0; i < str_input_shapes_0.size(); ++i) { + input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + paddle::lite_api::OutputOptModel( + FLAGS_model_dir_0, save_optimized_model_dir_0, input_shapes_0); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + if (FLAGS_test_type == 0) { + paddle::lite_api::RunTestType_00( + input_shapes, + save_optimized_model_dir, + static_cast(0), + FLAGS_threads, + FLAGS_repeats, + 5); + LOG(INFO) << "=========above is case 0, below is case " + "1============================"; + paddle::lite_api::RunTestType_10( + input_shapes, + save_optimized_model_dir, + static_cast(0), + FLAGS_threads, + FLAGS_repeats); + } + if (FLAGS_test_type == 1) { + paddle::lite_api::RunTestType_01( + input_shapes, + save_optimized_model_dir, + input_shapes_0, + save_optimized_model_dir_0, + static_cast(0), + FLAGS_threads, + FLAGS_repeats, + 5); + LOG(INFO) << "=========above is case 0, below is case " + "1============================"; + paddle::lite_api::RunTestType_11( + input_shapes, + save_optimized_model_dir, + input_shapes_0, + save_optimized_model_dir_0, + static_cast(0), + FLAGS_threads, + FLAGS_repeats); + } + +#endif + return 0; +} diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 79f9bea762e099b249f597dddb7df790361edc2a..bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -23,6 +23,10 @@ #include "lite/core/op_registry.h" DEFINE_string(optimized_model, "", "optimized_model"); +DEFINE_int32(N, 1, "input_batch"); +DEFINE_int32(C, 3, "input_channel"); +DEFINE_int32(H, 224, "input_height"); +DEFINE_int32(W, 224, "input_width"); namespace paddle { namespace lite { @@ -37,7 +41,8 @@ void TestModel(const std::vector& valid_places, predictor.Build(model_dir, "", "", valid_places); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); for (int i = 0; i < item_size; i++) { @@ -58,6 +63,8 @@ void TestModel(const std::vector& valid_places, predictor.SaveModel(FLAGS_optimized_model); } + LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " " + << FLAGS_H << " " << FLAGS_W; LOG(INFO) << "================== 
Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats @@ -123,10 +130,10 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 84bd27e352f549d619cfa51f9127f973023e6d45..012d6d48d9e6d3747f83a7f1089944bbaf359f71 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -23,6 +23,10 @@ #include "lite/core/op_registry.h" DEFINE_string(optimized_model, "", "optimized_model"); +DEFINE_int32(N, 1, "input_batch"); +DEFINE_int32(C, 3, "input_channel"); +DEFINE_int32(H, 224, "input_height"); +DEFINE_int32(W, 224, "input_width"); namespace paddle { namespace lite { @@ -38,7 +42,8 @@ void TestModel(const std::vector& valid_places, predictor.Build(model_dir, "", "", valid_places); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); for (int i = 0; i < item_size; i++) { @@ -59,6 +64,8 @@ void TestModel(const std::vector& valid_places, predictor.SaveModel(FLAGS_optimized_model); } + LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " " + << FLAGS_H << " " << FLAGS_W; LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats @@ -123,8 +130,11 @@ TEST(MobileNetV2, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV2, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc deleted file mode 100644 index daa57cd45632764172426cc41914abc7f82bea33..0000000000000000000000000000000000000000 --- a/lite/api/model_optimize_tool.cc +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#ifdef PADDLE_WITH_TESTING -#include -#endif -// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during -// model_optimize_tool's compiling period -#include "all_kernel_faked.cc" // NOLINT -#include "kernel_src_map.h" // NOLINT -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(model_dir, - "", - "path of the model. This option will be ignored if model_file " - "and param_file are exist"); -DEFINE_string(model_file, "", "model file path of the combined-param model"); -DEFINE_string(param_file, "", "param file path of the combined-param model"); -DEFINE_string( - optimize_out_type, - "protobuf", - "store type of the output optimized model. protobuf/naive_buffer"); -DEFINE_bool(display_kernels, false, "Display kernel information"); -DEFINE_bool(record_tailoring_info, - false, - "Record kernels and operators information of the optimized model " - "for tailoring compiling, information are stored into optimized " - "model path as hidden files"); -DEFINE_string(optimize_out, "", "path of the output optimized model"); -DEFINE_string(valid_targets, - "arm", - "The targets this model optimized for, should be one of (arm, " - "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); - -namespace paddle { -namespace lite_api { - -//! Display the kernel information. -void DisplayKernels() { - LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); -} - -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - - std::vector valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); - for (auto& target_repr : target_reprs) { - if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); - } else if (target_repr == "opencl") { - valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); - valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); - valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); - valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); - valid_places.emplace_back( - TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel - } else if (target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); - } else { - LOG(FATAL) << lite::string_format( - "Wrong target '%s' found, please check the command flag " - "'valid_targets'", - target_repr.c_str()); - } - } - - CHECK(!valid_places.empty()) - << "At least one target should be set, should set the " - "command argument 'valid_targets'"; - - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } - config.set_valid_places(valid_places); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { - model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { - model_type = LiteModelType::kNaiveBuffer; - } else { - LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type; - } - OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); - - predictor->SaveOptimizedModel( - FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info); - if (FLAGS_record_tailoring_info) { - LOG(INFO) << "Record the information of tailored model into :" - << FLAGS_optimize_out; - } -} - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, false); - paddle::lite_api::Main(); - return 0; -} diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 1358267000991c81b80453669cf46638449b8a7b..190890da4c109f39cc52ca5209cd952f8937f780 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include #include "lite/api/paddle_api.h" @@ -21,22 +22,22 @@ #include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" -#include "lite/tests/utils/timer.h" +#include "lite/core/profile/timer.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/basic_profiler.h" #endif // LITE_WITH_PROFILE -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_string(input_shape, "1,3,224,224", "input shapes, separated by colon and comma"); - DEFINE_bool(use_optimize_nb, false, "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); namespace paddle { namespace lite_api { @@ -47,7 +48,6 @@ void OutputOptModel(const std::string& load_model_dir, lite_api::CxxConfig config; config.set_model_dir(load_model_dir); config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}, }); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -72,12 +72,8 @@ void Run(const std::vector>& input_shapes, const int thread_num, const int repeat, const int warmup_times = 0) { -#ifdef LITE_WITH_PROFILE - lite::profile::BasicProfiler::Global().SetWarmup( - warmup_times); -#endif lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -91,6 +87,7 @@ void Run(const std::vector>& input_shapes, for (int i = 0; i < input_shapes[j].size(); ++i) { input_num *= input_shapes[j][i]; } + for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } @@ -102,20 +99,20 @@ void Run(const std::vector>& input_shapes, Timer ti; for (int j = 0; j < repeat; ++j) { - ti.start(); + ti.Start(); predictor->Run(); - ti.end(); - LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms"; + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; } LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup_times - << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms() + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() << " ms" - << ", min time: " << ti.get_min_time() << " ms" - << ", max time: " << ti.get_max_time() << " ms."; + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; auto output = predictor->GetOutput(0); auto out = output->data(); @@ -127,6 +124,28 @@ void Run(const std::vector>& input_shapes, output_num *= output_shape[i]; } LOG(INFO) << "output_num: " << output_num; + + // please turn off memory_optimize_pass to use this feature. + if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << std::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. 
/ arg_num;
+  }
 }
 #endif
diff --git a/lite/api/opt.cc b/lite/api/opt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a00646f4e11b68f0233a8b6009fbf847e9d50d63
--- /dev/null
+++ b/lite/api/opt.cc
@@ -0,0 +1,460 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#ifdef PADDLE_WITH_TESTING
+#include
+#endif
+// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h"
+// are created automatically during opt's compiling period
+#include
+#include "all_kernel_faked.cc"  // NOLINT
+#include "kernel_src_map.h"     // NOLINT
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/version.h"
+#include "lite/model_parser/compatible_pb.h"
+#include "lite/model_parser/pb/program_desc.h"
+#include "lite/utils/cp_logging.h"
+#include "lite/utils/string.h"
+#include "supported_kernel_op_info.h"  // NOLINT
+
+DEFINE_string(model_dir,
+              "",
+              "path of the model. This option will be ignored if model_file "
+              "and param_file exist");
+DEFINE_string(model_filename,
+              "",
+              "model topo filename of the model in the model set. This option"
+              " will be used to specify tailoring");
+DEFINE_string(param_filename,
+              "",
+              "model param filename of the model in the model set. This option"
+              " will be used to specify tailoring");
+DEFINE_string(model_set_dir,
+              "",
+              "path of the model set. This option will be used to specify"
+              " tailoring");
+DEFINE_string(model_file, "", "model file path of the combined-param model");
+DEFINE_string(param_file, "", "param file path of the combined-param model");
+DEFINE_string(
+    optimize_out_type,
+    "protobuf",
+    "store type of the output optimized model. protobuf/naive_buffer");
+DEFINE_bool(display_kernels, false, "Display kernel information");
+DEFINE_bool(record_tailoring_info,
+            false,
+            "Record kernel and operator information of the optimized model "
+            "for tailored compiling; the information is stored in the "
+            "optimized model path as hidden files");
+DEFINE_string(optimize_out, "", "path of the output optimized model");
+DEFINE_string(valid_targets,
+              "arm",
+              "The targets this model is optimized for; should be one of "
+              "(arm, opencl, x86, npu, xpu), separated by comma");
+DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
+DEFINE_bool(print_supported_ops,
+            false,
+            "Print supported operators on the input target");
+DEFINE_bool(print_all_ops,
+            false,
+            "Print all the valid operators of Paddle-Lite");
+DEFINE_bool(print_model_ops, false, "Print operators in the input model");
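Editor's note: these flags drive the conversion flow implemented below by `ParserValidPlaces` and `RunOptimize`. For embedding the same conversion in a host program instead of invoking the `opt` binary, a minimal sketch follows; the function name and model path are hypothetical, and an ARM-only, float-precision target is assumed:

```cpp
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"     // NOLINT
#include "lite/api/paddle_use_passes.h"  // NOLINT

void ConvertToNaiveBuffer() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // placeholder: non-combined model
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  // Writes ./mobilenet_v1_opt.nb, the combined naive-buffer model;
  // the ".nb" suffix is appended by SaveOptimizedModel itself.
  predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                paddle::lite_api::LiteModelType::kNaiveBuffer);
}
```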
+
+namespace paddle {
+namespace lite_api {
+//! Display the kernel information.
+void DisplayKernels() {
+  LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString();
+}
+
+std::vector<Place> ParserValidPlaces() {
+  std::vector<Place> valid_places;
+  auto target_reprs = lite::Split(FLAGS_valid_targets, ",");
+  for (auto& target_repr : target_reprs) {
+    if (target_repr == "arm") {
+      valid_places.emplace_back(TARGET(kARM));
+    } else if (target_repr == "opencl") {
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
+    } else if (target_repr == "x86") {
+      valid_places.emplace_back(TARGET(kX86));
+    } else if (target_repr == "npu") {
+      valid_places.emplace_back(TARGET(kNPU));
+    } else if (target_repr == "xpu") {
+      valid_places.emplace_back(TARGET(kXPU));
+    } else {
+      LOG(FATAL) << lite::string_format(
+          "Wrong target '%s' found, please check the command flag "
+          "'valid_targets'",
+          target_repr.c_str());
+    }
+  }
+
+  CHECK(!valid_places.empty())
+      << "At least one target should be set; please set the "
+         "command argument 'valid_targets'";
+
+  if (FLAGS_prefer_int8_kernel) {
+    LOG(WARNING) << "Int8 mode is only supported by the ARM target";
+    valid_places.insert(valid_places.begin(),
+                        Place{TARGET(kARM), PRECISION(kInt8)});
+  }
+  return valid_places;
+}
+
+void RunOptimize(const std::string& model_dir,
+                 const std::string& model_file,
+                 const std::string& param_file,
+                 const std::string& optimize_out,
+                 const std::string& optimize_out_type,
+                 const std::vector<Place>& valid_places,
+                 bool record_tailoring_info) {
+  if (!model_file.empty() && !param_file.empty()) {
+    LOG(WARNING)
+        << "Load combined-param model.
Option model_dir will be ignored"; + } + + lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_model_file(model_file); + config.set_param_file(param_file); + config.set_valid_places(valid_places); + auto predictor = lite_api::CreatePaddlePredictor(config); + + LiteModelType model_type; + if (optimize_out_type == "protobuf") { + model_type = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } + + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + predictor->SaveOptimizedModel( + optimize_out, model_type, record_tailoring_info); + if (record_tailoring_info) { + LOG(INFO) << "Record the information of tailored model into :" + << optimize_out; + } +} + +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} +void PrintOpsInfo(std::set valid_ops = {}) { + std::vector targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + int maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (int i = 0; i < targets.size(); i++) { + std::cout << std::setw(10) << targets[i].substr(1); + } + std::cout << std::endl; + if (valid_ops.empty()) { + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + std::cout << std::setw(maximum_optype_length) << it->first; + auto ops_valid_places = it->second; + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } else { + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } +} +/// Print help information +void PrintHelpInfo() { + // at least one argument should be inputed + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. 
Valid arguments are listed " + "below:\n" + " Arguments of model optimization:\n" + " `--model_dir=`\n" + " `--model_file=`\n" + " `--param_file=`\n" + " `--optimize_out_type=(protobuf|naive_buffer)`\n" + " `--optimize_out=`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--prefer_int8_kernel=(true|false)`\n" + " `--record_tailoring_info=(true|false)`\n" + " Arguments of model checking and ops information:\n" + " `--print_all_ops=true` Display all the valid operators of " + "Paddle-Lite\n" + " `--print_supported_ops=true " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display valid operators of input targets\n" + " `--print_model_ops=true --model_dir= " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display operators in the input model\n"; + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; + exit(1); +} + +// Parse Input command +void ParseInputCommand() { + if (FLAGS_print_all_ops) { + std::cout << "All OPs supported by Paddle-Lite: " << supported_ops.size() + << " ops in total." << std::endl; + PrintOpsInfo(); + exit(1); + } else if (FLAGS_print_supported_ops) { + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // get valid_targets string + std::vector target_types = {}; + for (int i = 0; i < valid_places.size(); i++) { + target_types.push_back(valid_places[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (int i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + std::set valid_ops; + for (int i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + PrintOpsInfo(valid_ops); + exit(1); + } +} +// test whether this model is supported +void CheckIfModelSupported() { + // 1. 
parse valid places and valid targets + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // set valid_ops + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (int i = 0; i < valid_places.size(); i++) { + auto target = valid_places[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = FLAGS_model_dir + "/__model__"; + if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { + prog_path = FLAGS_model_file; + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (FLAGS_print_model_ops) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (int i = 0; i < valid_places.size(); i++) { + targets.push_back(valid_places[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (int i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (FLAGS_print_model_ops) { + std::cout << "Paddle-Lite supports this model!" 
              << std::endl;
+    exit(1);
+  }
+}
+
+void Main() {
+  if (FLAGS_display_kernels) {
+    DisplayKernels();
+    exit(0);
+  }
+
+  auto valid_places = ParserValidPlaces();
+  if (FLAGS_model_set_dir == "") {
+    RunOptimize(FLAGS_model_dir,
+                FLAGS_model_file,
+                FLAGS_param_file,
+                FLAGS_optimize_out,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    return;
+  }
+
+  if (!FLAGS_record_tailoring_info) {
+    LOG(WARNING) << "--model_set_dir option can only be used together with "
+                    "--record_tailoring_info=true";
+    return;
+  }
+
+  auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true);
+  if (model_dirs.size() == 0) {
+    LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model";
+  }
+  // Optimize models in FLAGS_model_set_dir
+  for (const auto& name : model_dirs) {
+    std::string input_model_dir =
+        lite::Join<std::string>({FLAGS_model_set_dir, name}, "/");
+    std::string output_model_dir =
+        lite::Join<std::string>({FLAGS_optimize_out, name}, "/");
+
+    std::string model_file = "";
+    std::string param_file = "";
+
+    if (FLAGS_model_filename != "" && FLAGS_param_filename != "") {
+      model_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_model_filename}, "/");
+      param_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_param_filename}, "/");
+    }
+
+    LOG(INFO) << "Start optimizing model: " << input_model_dir;
+    RunOptimize(input_model_dir,
+                model_file,
+                param_file,
+                output_model_dir,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    LOG(INFO) << "Optimize done.";
+  }
+
+  // Collect all models' information
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME);
+  CollectModelMetaInfo(FLAGS_optimize_out,
+                       model_dirs,
+                       lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
+}
+
+}  // namespace lite_api
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  // If there is no input argument, print help info.
+  if (argc < 2) {
+    paddle::lite_api::PrintHelpInfo();
+  }
+  google::ParseCommandLineFlags(&argc, &argv, false);
+  paddle::lite_api::ParseInputCommand();
+  paddle::lite_api::CheckIfModelSupported();
+  paddle::lite_api::Main();
+  return 0;
+}
diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index f148096bb69a3a249521bcb847d5beae3f8297f9..9f071cf7780e27defdd1fcd6be02844618165fb6 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) {
   }
 }
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpu(T *data) const {
   const T *src_data = tensor(raw_tensor_)->data<T>();
   int64_t num = tensor(raw_tensor_)->numel();
   CHECK(num > 0) << "You should call Resize interface first";
@@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu(const int *);
 template void Tensor::CopyFromCpu(const float *);
 template void Tensor::CopyFromCpu(const int8_t *);
 template void Tensor::CopyFromCpu(const int *);
+template void Tensor::CopyFromCpu(const int64_t *);
 template void Tensor::CopyFromCpu(const float *);
 template void Tensor::CopyFromCpu(const int8_t *);
 
-template void Tensor::CopyToCpu(int8_t *);
-template void Tensor::CopyToCpu(float *);
-template void Tensor::CopyToCpu(int *);
+template void Tensor::CopyToCpu(int8_t *) const;
+template void Tensor::CopyToCpu(float *) const;
+template void Tensor::CopyToCpu(int *) const;
 
 shape_t Tensor::shape() const {
   return ctensor(raw_tensor_)->dims().Vectorize();
@@ -189,5 +190,27 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }
 
+// set model data in combined format; `set_model_from_file` loads the model
+// from a file, while `set_model_from_buffer` loads it from a memory buffer
+void MobileConfig::set_model_from_file(const std::string &x) {
+  lite_model_file_ = x;
+}
+void MobileConfig::set_model_from_buffer(const std::string &x) {
+  lite_model_file_ = x;
+  model_from_memory_ = true;
+}
+void MobileConfig::set_model_buffer(const char *model_buffer,
+                                    size_t model_buffer_size,
+                                    const char *param_buffer,
+                                    size_t param_buffer_size) {
+  LOG(WARNING) << "warning: `set_model_buffer` will be abandoned in "
+                  "release/v3.0.0; the new method `set_model_from_buffer(const "
+                  "std::string &x)` is recommended.";
+  model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
+  param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
+  model_from_memory_ = true;
+}
+
 }  // namespace lite_api
 }  // namespace paddle
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 42b455da811fe1a21277d38f2e1237000276b1ff..307eeb74e8b4cdc3b2d6188eb18490e4dcf89b8f 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -49,7 +49,7 @@ struct LITE_API Tensor {
   void CopyFromCpu(const T* data);
 
   template <typename T>
-  void CopyToCpu(T* data);
+  void CopyToCpu(T* data) const;
 
   /// Shape of the tensor.
   /// Shape of the tensor.
   shape_t shape() const;
 
   TargetType target() const;
@@ -133,6 +133,9 @@ class LITE_API CxxConfig : public ConfigBase {
   std::string model_file_;
   std::string param_file_;
   bool model_from_memory_{false};
+#ifdef LITE_WITH_X86
+  int x86_math_library_math_threads_ = 1;
+#endif
 
  public:
   void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -151,27 +154,54 @@ class LITE_API CxxConfig : public ConfigBase {
   std::string model_file() const { return model_file_; }
   std::string param_file() const { return param_file_; }
   bool model_from_memory() const { return model_from_memory_; }
+
+#ifdef LITE_WITH_X86
+  void set_x86_math_library_num_threads(int threads) {
+    x86_math_library_math_threads_ = threads;
+  }
+  int x86_math_library_num_threads() const {
+    return x86_math_library_math_threads_;
+  }
+#endif
 };
 
 /// MobileConfig is the config for the light weight predictor, it will skip
 /// IR optimization or other unnecessary stages.
 class LITE_API MobileConfig : public ConfigBase {
+  // Whether to load model data from memory. Model data will be loaded from a
+  // memory buffer if model_from_memory_ is true.
+  bool model_from_memory_{false};
+
+  // Model data read from a file or a memory buffer, in combined format.
+  std::string lite_model_file_;
+
+  // NOTE: This is a deprecated variable and will be removed in a later release.
   std::string model_buffer_;
   std::string param_buffer_;
-  bool model_from_memory_{false};
 
  public:
+  // Set model data in combined format: `set_model_from_file` loads the model
+  // from a file, while `set_model_from_buffer` loads it from a memory buffer.
+  void set_model_from_file(const std::string& x);
+  void set_model_from_buffer(const std::string& x);
+  // Return the model data held in lite_model_file_, which is in combined
+  // format.
+  const std::string& lite_model_file() const { return lite_model_file_; }
+
+  // Return model_from_memory_, which indicates whether the model is loaded
+  // from a memory buffer.
+  bool model_from_memory() const { return model_from_memory_; }
+
+  // NOTE: This is a deprecated API and will be removed in a later release.
   void set_model_buffer(const char* model_buffer,
                         size_t model_buffer_size,
                         const char* param_buffer,
-                        size_t param_buffer_size) {
-    model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
-    param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
-    model_from_memory_ = true;
-  }
+                        size_t param_buffer_size);
 
-  bool model_from_memory() const { return model_from_memory_; }
+  // NOTE: This is a deprecated API and will be removed in a later release.
   const std::string& model_buffer() const { return model_buffer_; }
+
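The new x86 accessor pair above only exists when LITE_WITH_X86 is defined, so callers should guard too; a sketch mirroring the test_step_rnn_lite_x86 change later in this patch (the model path is a placeholder):

    paddle::lite_api::CxxConfig config;
    config.set_model_dir("step_rnn_model_dir");  // placeholder path
    #ifdef LITE_WITH_X86
    // Pin the x86 math library (e.g. MKL) to a single thread.
    config.set_x86_math_library_num_threads(1);
    #endif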
+  // NOTE: This is a deprecated API and will be removed in a later release.
   const std::string& param_buffer() const { return param_buffer_; }
 };
diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc
index 69d544c3decac9f312bc9eb03cdc6c3702c5032b..9213a24e5c0614550a098c4de8d97b6cf6695177 100644
--- a/lite/api/paddle_api_test.cc
+++ b/lite/api/paddle_api_test.cc
@@ -72,7 +72,7 @@ TEST(CxxApi, run) {
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 TEST(LightApi, run) {
   lite_api::MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir + ".opt2.naive");
+  config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
@@ -109,16 +109,11 @@ TEST(LightApi, run) {
 // Demo2 for Loading model from memory
 TEST(MobileConfig, LoadfromMemory) {
   // Get naive buffer
-  auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb";
-  auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb";
-  std::string model_buffer = lite::ReadFile(model_path);
-  size_t size_model = model_buffer.length();
-  std::string params_buffer = lite::ReadFile(params_path);
-  size_t size_params = params_buffer.length();
+  auto model_file = std::string(FLAGS_model_dir) + ".opt2.naive.nb";
+  std::string model_buffer = lite::ReadFile(model_file);
 
   // set model buffer and run model
   lite_api::MobileConfig config;
-  config.set_model_buffer(
-      model_buffer.c_str(), size_model, params_buffer.c_str(), size_params);
+  config.set_model_from_buffer(model_buffer);
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
   auto input_tensor = predictor->GetInput(0);
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index 894d839185ea9e1b6b47b87c398f249f044c2b51..2cced919e601f8ecb79ce262a2b083d5b6862da9 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -55,7 +55,8 @@ const std::string& TargetToStr(TargetType target) {
                                           "any",
                                           "fpga",
                                           "npu",
-                                          "xpu"};
+                                          "xpu",
+                                          "bm"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -77,7 +78,8 @@ const std::string& PrecisionToStr(PrecisionType precision) {
 }
 
 const std::string& DataLayoutToStr(DataLayoutType layout) {
-  static const std::string datalayout2string[] = {"unk", "NCHW", "any", "NHWC"};
+  static const std::string datalayout2string[] = {
+      "unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"};
   auto x = static_cast<int>(layout);
   CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
   return datalayout2string[x];
@@ -93,7 +95,8 @@ const std::string& TargetRepr(TargetType target) {
                                           "kAny",
                                           "kFPGA",
                                           "kNPU",
-                                          "kXPU"};
+                                          "kXPU",
+                                          "kBM"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -115,8 +118,13 @@ const std::string& PrecisionRepr(PrecisionType precision) {
 }
 
 const std::string& DataLayoutRepr(DataLayoutType layout) {
-  static const std::string datalayout2string[] = {
-      "kUnk", "kNCHW", "kAny", "kNHWC"};
+  static const std::string datalayout2string[] = {"kUnk",
+                                                  "kNCHW",
+                                                  "kAny",
+                                                  "kNHWC",
+                                                  "kImageDefault",
+                                                  "kImageFolder",
+                                                  "kImageNW"};
   auto x = static_cast<int>(layout);
   CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
   return datalayout2string[x];
@@ -129,6 +137,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                                 TARGET(kOpenCL),
                                                 TARGET(kNPU),
                                                 TARGET(kXPU),
+                                                TARGET(kBM),
                                                 TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
@@ -146,8 +155,12 @@ std::set<PrecisionType> ExpandValidPrecisions(PrecisionType precision) {
 }
 
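These string tables are indexed by the raw enum value, so each new entry has to sit at the index of the matching enumerator in paddle_place.h (kBM = 10 is the 11th entry). A quick sanity check, as a sketch:

    #include <cassert>

    void CheckBmPlaceStrings() {
      // TargetToStr/TargetRepr index their tables with static_cast<int>(target).
      assert(paddle::lite_api::TargetToStr(TARGET(kBM)) == "bm");
      assert(paddle::lite_api::TargetRepr(TARGET(kBM)) == "kBM");
    }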
 std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
-  static const std::set<DataLayoutType> valid_set(
-      {DATALAYOUT(kNCHW), DATALAYOUT(kAny), DATALAYOUT(kNHWC)});
+  static const std::set<DataLayoutType> valid_set({DATALAYOUT(kNCHW),
+                                                   DATALAYOUT(kAny),
+                                                   DATALAYOUT(kNHWC),
+                                                   DATALAYOUT(kImageDefault),
+                                                   DATALAYOUT(kImageFolder),
+                                                   DATALAYOUT(kImageNW)});
   if (layout == DATALAYOUT(kAny)) {
     return valid_set;
   }
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index 07284be095c05e5dfa069b0973d5982cf1f07c8a..7da52adc7fb6fdd70de3b098508e4622496bed7d 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -52,8 +52,9 @@ enum class TargetType : int {
   kFPGA = 7,
   kNPU = 8,
   kXPU = 9,
+  kBM = 10,
   kAny = 6,  // any target
-  NUM = 10,  // number of fields.
+  NUM = 11,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
@@ -71,8 +72,11 @@ enum class DataLayoutType : int {
   kUnk = 0,
   kNCHW = 1,
   kNHWC = 3,
-  kAny = 2,  // any data layout
-  NUM = 4,   // number of fields.
+  kImageDefault = 4,  // for opencl image2d
+  kImageFolder = 5,   // for opencl image2d
+  kImageNW = 6,       // for opencl image2d
+  kAny = 2,           // any data layout
+  NUM = 7,            // number of fields.
 };
 
 typedef enum {
@@ -112,6 +116,34 @@ static size_t PrecisionTypeLength(PrecisionType type) {
   }
 }
 
+template <typename T>
+struct PrecisionTypeTrait {
+  constexpr static PrecisionType Type() { return PrecisionType::kUnk; }
+};
+
+#define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \
+  callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type);
+
+#define _ForEachPrecisionType(callback)                   \
+  _ForEachPrecisionTypeHelper(callback, bool, kBool);     \
+  _ForEachPrecisionTypeHelper(callback, float, kFloat);   \
+  _ForEachPrecisionTypeHelper(callback, int8_t, kInt8);   \
+  _ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \
+  _ForEachPrecisionTypeHelper(callback, int, kInt32);     \
+  _ForEachPrecisionTypeHelper(callback, int64_t, kInt64);
+
+#define DefinePrecisionTypeTrait(cpp_type, precision_type)           \
+  template <>                                                        \
+  struct PrecisionTypeTrait<cpp_type> {                              \
+    constexpr static PrecisionType Type() { return precision_type; } \
+  }
+
+_ForEachPrecisionType(DefinePrecisionTypeTrait);
+
+#undef _ForEachPrecisionTypeHelper
+#undef _ForEachPrecisionType
+#undef DefinePrecisionTypeTrait
+
 #define TARGET(item__) paddle::lite_api::TargetType::item__
 #define PRECISION(item__) paddle::lite_api::PrecisionType::item__
 #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
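The PrecisionTypeTrait machinery added to paddle_place.h maps a C++ scalar type to its PrecisionType at compile time, generating one specialization per entry of _ForEachPrecisionType; unlisted types fall back to the primary template and report kUnk. A sketch of what the expansion guarantees:

    #include <cstdint>

    using paddle::lite_api::PrecisionType;
    using paddle::lite_api::PrecisionTypeTrait;

    static_assert(PrecisionTypeTrait<float>::Type() == PrecisionType::kFloat,
                  "float maps to kFloat");
    static_assert(PrecisionTypeTrait<int64_t>::Type() == PrecisionType::kInt64,
                  "int64_t maps to kInt64");
    // double is not registered, so it falls back to the primary template.
    static_assert(PrecisionTypeTrait<double>::Type() == PrecisionType::kUnk,
                  "unregistered types yield kUnk");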
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index 70355fdf890eb63cd5bedd5bab42a2dd69af0927..943760d30742b74a0fe9150e4c2d8c8bb5dbc52a 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -20,7 +20,6 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-USE_MIR_PASS(subgraph_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
 
@@ -32,11 +31,17 @@ USE_MIR_PASS(lite_fc_fuse_pass);
 USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
 USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
 USE_MIR_PASS(lite_interpolate_fuse_pass);
+USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
 USE_MIR_PASS(identity_scale_eliminate_pass);
 USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
 USE_MIR_PASS(lite_conv_activation_fuse_pass);
+USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
 USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
 USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
+USE_MIR_PASS(npu_subgraph_pass);
+USE_MIR_PASS(xpu_subgraph_pass);
+USE_MIR_PASS(weight_quantization_preprocess_pass);
diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt
index 178f167e6a1627d01df13b2e105e0af36b20601a..eabb6b150b93a722282118c3932676cd1aee5da8 100644
--- a/lite/api/python/pybind/CMakeLists.txt
+++ b/lite/api/python/pybind/CMakeLists.txt
@@ -4,3 +4,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
 endif()
 
 lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
+if (LITE_ON_TINY_PUBLISH)
+  set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
+endif()
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index 2df2e8f8f8aa56bb71b0e1cb293df2ecbbafd0bb..2dfe0c49490ecd13e8a3ce480807bdf3875348b7 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -116,6 +116,8 @@ void BindLiteMobileConfig(py::module *m) {
   py::class_<MobileConfig> mobile_config(*m, "MobileConfig");
 
   mobile_config.def(py::init<>())
+      .def("set_model_from_file", &MobileConfig::set_model_from_file)
+      .def("set_model_from_buffer", &MobileConfig::set_model_from_buffer)
       .def("set_model_dir", &MobileConfig::set_model_dir)
       .def("model_dir", &MobileConfig::model_dir)
       .def("set_model_buffer", &MobileConfig::set_model_buffer)
@@ -165,6 +167,9 @@ void BindLitePlace(py::module *m) {
   py::enum_<DataLayoutType>(*m, "DataLayoutType")
       .value("NCHW", DataLayoutType::kNCHW)
       .value("NHWC", DataLayoutType::kNHWC)
+      .value("ImageDefault", DataLayoutType::kImageDefault)
+      .value("ImageFolder", DataLayoutType::kImageFolder)
+      .value("ImageNW", DataLayoutType::kImageNW)
       .value("Any", DataLayoutType::kAny);
 
   // Place
diff --git a/lite/api/resnet50_test_fpga.cc b/lite/api/resnet50_test_fpga.cc
index ab647f96998f1c0e73476369611218d0a7930c57..75e6f0cbbc43c3cd7eb9bfa89bc004554ea6f85b 100644
--- a/lite/api/resnet50_test_fpga.cc
+++ b/lite/api/resnet50_test_fpga.cc
@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
   std::vector<Place> valid_places(
       {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
 
-  predictor.Build(FLAGS_model_dir,
-                  "",
-                  "",
-                  Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
-                  valid_places);
+  predictor.Build(FLAGS_model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
   input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
diff --git a/lite/api/test_resnet50_lite_bm.cc b/lite/api/test_resnet50_lite_bm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62a58704f4245b8618540ea7109447dd99d0bfea
--- /dev/null
+++ b/lite/api/test_resnet50_lite_bm.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
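The new BM test that follows drives the full CxxApi Predictor with an explicit pass list; the essential pattern is (a sketch with a placeholder model directory):

    paddle::lite::Predictor predictor;
    std::vector<paddle::lite::Place> valid_places{
        {TARGET(kBM), PRECISION(kFloat)}, {TARGET(kX86), PRECISION(kFloat)}};
    // The BM backend is wired in through a dedicated subgraph pass.
    std::vector<std::string> passes{"bm_subgraph_pass"};
    predictor.Build("resnet50_model_dir", "", "", valid_places, passes);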
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_img_txt_path,
+              "",
+              "if set, read the input data from this text file; otherwise the "
+              "input is filled with ones.");
+
+namespace paddle {
+namespace lite {
+
+void TestModel(const std::vector<Place>& valid_places) {
+  lite::Predictor predictor;
+  std::vector<std::string> passes;
+  passes.push_back("bm_subgraph_pass");
+  predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  if (FLAGS_input_img_txt_path.empty()) {
+    for (int i = 0; i < item_size; i++) {
+      data[i] = 1;
+    }
+  } else {
+    std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
+    if (!fs.is_open()) {
+      LOG(FATAL) << "failed to open input_img_txt_path file.";
+    }
+    for (int i = 0; i < item_size; i++) {
+      fs >> data[i];
+    }
+  }
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average.";
+
+  auto* out = predictor.GetOutput(0);
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 1);
+  ASSERT_EQ(out->dims()[1], 1000);
+
+  auto* out_data = out->data<float>();
+  FILE* fp = fopen("result.txt", "wb");
+  for (int i = 0; i < out->numel(); i++) {
+    fprintf(fp, "%f\n", out_data[i]);
+  }
+  fclose(fp);
+}
+
+TEST(ResNet50, test_bm) {
+  std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
+
+  TestModel(valid_places);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc
index c483373dc745f6520d51ece3936448ada71990d3..013fd82b19bc22ace22184389249a7b2d9bf237e 100644
--- a/lite/api/test_step_rnn_lite_x86.cc
+++ b/lite/api/test_step_rnn_lite_x86.cc
@@ -12,20 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
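A note on the speed report in the test above: GetCurrentUS() returns wall-clock microseconds, so (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 is the mean per-run latency in milliseconds. For example, if 100 repeats take 2,500,000 us in total, the test reports 25 ms; the warmup iterations are deliberately excluded from the timed window.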
- #include #include #include @@ -44,6 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::string model_dir = FLAGS_model_dir; lite_api::CxxConfig config; config.set_model_dir(model_dir); +#ifdef LITE_WITH_X86 + config.set_x86_math_library_num_threads(1); +#endif config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); @@ -62,7 +51,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { "micro_video_id", "vertical_type_id"}; - for (int i = 0; i < target_names.size(); ++i) { + for (size_t i = 0; i < target_names.size(); ++i) { auto input_tensor = predictor->GetInput(i); int size = 0; if (i == 6 || i == 8) { @@ -87,8 +76,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { predictor->Run(); } - // LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + LOG(INFO) << "warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; @@ -99,8 +87,8 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::vector out_shape = out->shape(); - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR( out->data()[j + (out_shape[1] * i)], results[i][j], 1e-6); } diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index dec63e6efa0e4c4548646ebdd6f6de24f046d6d0..e3517464812a24c9911e824c53841efc05dd2bc5 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(bm) diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index cbbcf49a5fd55dabd6b072bc6b3b2e3f9bb91a13..6f6f7e7aa71ba5067d831a2bcc2b7b933205fbe0 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -57,9 +57,10 @@ endif() if (NOT HAS_ARM_MATH_LIB_DIR) # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc + cc_library(math_arm SRCS + funcs.cc packed_sgemm.cc + packed_sgemm_c4.cc sgemm.cc gemm_prepacked_int8.cc gemm_s8.cc @@ -67,25 +68,26 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc - conv3x3s1_depthwise_fp32.cc - conv3x3s2_depthwise_fp32.cc + conv3x3s1p01_depthwise_fp32.cc + conv3x3s2p01_depthwise_fp32.cc + conv3x3s1px_depthwise_fp32.cc + conv3x3s2px_depthwise_fp32.cc conv3x3s1_direct_int8.cc conv3x3s2_direct_int8.cc conv3x3s1_depthwise_int8.cc conv3x3s2_depthwise_int8.cc conv5x5s1_depthwise_int8.cc conv5x5s1_depthwise_fp32.cc + conv5x5s2_depthwise_int8.cc conv5x5s2_depthwise_fp32.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_3x3s1.cc - conv_depthwise_3x3s2.cc + conv3x3_winograd_fp32_c4.cc conv_winograd_3x3.cc conv_impl.cc - softmax.cc + softmax.cc scale.cc pooling.cc elementwise.cc + layout.cc lrn.cc decode_bboxes.cc concat.cc @@ -119,6 +121,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) stack.cc affine_channel.cc anchor_generator.cc + split_merge_lod_tenosr.cc + reduce_prod.cc DEPS ${lite_kernel_deps} context tensor) endif() - diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc index 
b5d2c6af13cc1dd864eaac6cb6589cc879f029fe..38be1d689dd47ab59baf417e40989a91bb6366e0 100644
--- a/lite/backends/arm/math/col_im_transform.cc
+++ b/lite/backends/arm/math/col_im_transform.cc
@@ -32,8 +32,10 @@ void col2im(const float* data_col,
             const int width,
             const int kernel_h,
             const int kernel_w,
-            const int pad_h,
-            const int pad_w,
+            const int pad_h0,
+            const int pad_h1,
+            const int pad_w0,
+            const int pad_w1,
             const int stride_h,
             const int stride_w,
             const int dilation_h,
@@ -41,19 +43,22 @@ void col2im(const float* data_col,
             float* data_im) {
   memset(data_im, 0, height * width * channels * sizeof(float));
   const int output_h =
-      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+      (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) /
+          stride_h +
+      1;
   const int output_w =
-      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+      (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+      1;
   const int channel_size = height * width;
   for (int channel = channels; channel--; data_im += channel_size) {
     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
-        int input_row = -pad_h + kernel_row * dilation_h;
+        int input_row = -pad_h0 + kernel_row * dilation_h;
         for (int output_rows = output_h; output_rows; output_rows--) {
           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
             data_col += output_w;
           } else {
-            int input_col = -pad_w + kernel_col * dilation_w;
+            int input_col = -pad_w0 + kernel_col * dilation_w;
             for (int output_col = output_w; output_col; output_col--) {
               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                 data_im[input_row * width + input_col] += *data_col;
diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h
index 8560679d7f4091c4cb424b54e54a42cf6e7e8905..e3e32c4715ade10972f77e0c4d5a2cd4d16b4725 100644
--- a/lite/backends/arm/math/col_im_transform.h
+++ b/lite/backends/arm/math/col_im_transform.h
@@ -26,8 +26,10 @@ void col2im(const Dtype* data_col,
             const int width,
             const int kernel_h,
             const int kernel_w,
-            const int pad_h,
-            const int pad_w,
+            const int pad_h0,
+            const int pad_h1,
+            const int pad_w0,
+            const int pad_w1,
             const int stride_h,
             const int stride_w,
             const int dilation_h,
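The col2im change above replaces the symmetric pads (pad_h, pad_w) with independent pads per side (pad_h0/pad_h1 for top/bottom, pad_w0/pad_w1 for left/right), so the reconstructed extent becomes output_h = (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1, and only the leading pads (pad_h0, pad_w0) shift the input origin. For example, height = 224, kernel_h = 3, dilation_h = 1, stride_h = 1 with pad_h0 = 1, pad_h1 = 0 gives (224 + 1 + 0 - 3) / 1 + 1 = 223, a shape the old symmetric-pad formula could not express.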
diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc
index 9b94cefa16bca0dd487ad0e4f6b88e604b694416..65f93453388d7f41d73669f583d189bec9035bb5 100644
--- a/lite/backends/arm/math/concat.cc
+++ b/lite/backends/arm/math/concat.cc
@@ -26,31 +26,32 @@ namespace math {
 void concat_func(const std::vector<lite::Tensor *> &input,
                  const int axis,
                  lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
+  int64_t concat_input_size = 1;
+  int64_t num_concats = 1;
   auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
+  size_t num = input.size();
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
   }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
+  for (int i = 0; i < axis; i++) {
+    num_concats *= dim_0[i];
   }
-
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
+  float *dst_ptr = output->mutable_data<float>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n]->dims();
+    const float *src_ptr = input[n]->data<float>();
+    int64_t in_concat_axis = dims[axis];
+    float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_concats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
     }
+    offset_concat_axis += in_concat_axis;
   }
 }
diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1992f62bbfa9e15ab4d39565f7fe3555e17b215
--- /dev/null
+++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc
@@ -0,0 +1,1310 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/backends/arm/math/packed_sgemm_c4.h"
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
+#include <arm_neon.h>
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+void input_trans_c4_8x8(const float* src,
+                        int src_stride,
+                        float* dest,
+                        int dest_stride);
+void output_trans_c4_6x8(const float* src,
+                         int src_stride,
+                         float* dest,
+                         int dest_stride);
+void output_trans_c4_post_6x8(const float* src,
+                              int src_stride,
+                              float* dest,
+                              int dest_stride,
+                              float* bias_value,
+                              bool has_relu);
+void input_trans_c4_4x4(const float* src,
+                        int src_stride,
+                        int src_h_stride,
+                        float* dest,
+                        int dest_stride,
+                        int dest_h_stride);
+void output_trans_c4_post_2x4(const float* src,
+                              int src_stride,
+                              int src_h_stride,
+                              float* dest,
+                              int dest_stride,
+                              int dest_h_stride,
+                              float* bias_value,
+                              bool has_relu);
+void weight_trans_c4_8x8(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void weight_trans_c4_4x4(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+
+/*
+* The following functions conv_compute_6x6_3x3 and conv_compute_2x2_3x3[_small]
+* are based on
+* MNN[https://github.com/alibaba/MNN]
+*
+* Copyright © 2018, Alibaba Group Holding Limited
+*/
+
+// F(6,3)
+void conv_compute_6x6_3x3(const float* input,
+                          float* output,
+                          int num,
+                          int chout,
+                          int hout,
+                          int wout,
+                          int chin,
+                          int hin,
+                          int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx) {
+  auto act_param = param.activation_param;
+  const int pad_h = (*param.paddings)[0];
+  const int pad_w = (*param.paddings)[2];
+  float* tmp_work_space =
+      ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
+
+  int in_n_stride = chin * hin * win;
+  int out_n_stride = chout * hout * wout;
+  int ic_stride = win * hin;
+  int
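Winograd F(m,3) consumes (m+2)x(m+2) input tiles and produces m x m output tiles, which is where the constants in this file come from: F(6,3) works on 8x8 tiles with 64 transform points, F(2,3) on 4x4 tiles with 16. A sketch of the tile bookkeeping for F(6,3), assuming a 20x20 output plane:

    int wout = 20, hout = 20;
    int tile_w = (wout + 5) / 6;      // 4 tile columns, 6 outputs each
    int tile_h = (hout + 5) / 6;      // 4 tile rows
    int size_tile = tile_h * tile_w;  // 16 tiles, walked in blocks of 8
    // Each of the 64 transform points then becomes one small GEMM over all
    // tiles in the block (the gi loop in the function body).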
oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 256; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 256; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 256; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 256; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index * 6; + int src_y = th_index * 6; + int ex = src_x + 8 > w_pad ? w_pad - src_x : 8; + int ey = src_y + 8 > h_pad ? 
h_pad - src_y : 8; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 8 && ey == 8) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + for (int i = 0; i < 8; ++i) { + const float* ci_ptr = src_ci + i * w_pad * 4; + input_trans_c4_8x8(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4_8x8(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 256 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 32; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + for (int i = 0; i < 8; ++i) { + float* ci_ptr = trans_remain_tmp_data + i * 32; + input_trans_c4_8x8(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4_8x8(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 256; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 64; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 6; + int dst_y = th_index * 6; + + int ex = dst_x + 6 > wout ? wout - dst_x : 6; + int ey = dst_y + 6 > hout ? 
hout - dst_y : 6; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 6) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4_6x8(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post_6x8(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4_6x8(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post_6x8(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + // copy to dest + memset(trans_tmp_data, 0, 144 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 24, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute + +// F(2,3) +void conv_compute_2x2_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 64; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = 
g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 64; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 64; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 64; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4( + src_ci, 4, w_pad * 4, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 64 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 16; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4(trans_remain_tmp_data, + 4, + 16, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 64; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 16; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 2) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + // copy to dest + memset(trans_tmp_data, 0, 16 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 8, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute +void conv_compute_2x2_3x3_small(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 64; + memset(g_tmp_data, 0, tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + 64; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + 
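conv_compute_2x2_3x3_small differs from conv_compute_2x2_3x3 chiefly in its threading strategy: instead of giving every OpenMP thread private scratch buffers and parallelizing over tile blocks, it keeps a single scratch area and parallelizes over the 16 per-transform-point GEMMs (the `#pragma omp parallel for` sits on the gi loop instead of the tbi loop), which saves workspace and balances better when the output plane, and hence size_tile, is small.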
chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; + for (int tbi = 0; tbi < block_count; ++tbi) { + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4( + src_ci, 4, w_pad * 4, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 64 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 16; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4(trans_remain_tmp_data, + 4, + 16, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 64; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; +#pragma omp parallel for num_threads(threads) + for (int gi = 0; gi < 16; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 2) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + // copy to dest + memset(trans_tmp_data, 0, 16 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 8, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + &act_param); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute +void output_trans_c4_6x8(const float* src, + int src_stride, + float* dest, + int dest_stride) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void output_trans_c4_post_6x8(const float* src, + int src_stride, + float* 
dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest0 = vaddq_f32(dest0, bias); + dest1 = vaddq_f32(dest1, bias); + dest2 = vaddq_f32(dest2, bias); + dest3 = vaddq_f32(dest3, bias); + dest4 = vaddq_f32(dest4, bias); + dest5 = vaddq_f32(dest5, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest0 = vmaxq_f32(dest0, zeros); + dest1 = vmaxq_f32(dest1, zeros); + dest2 = vmaxq_f32(dest2, zeros); + dest3 = vmaxq_f32(dest3, zeros); + dest4 = vmaxq_f32(dest4, zeros); + dest5 = vmaxq_f32(dest5, zeros); + } + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void input_trans_c4_8x8(const float* src, + int src_stride, + float* dest, + int dest_stride) { + float32x4_t src0 = vld1q_f32(src); + float32x4_t src1 = vld1q_f32(src + src_stride); + float32x4_t src2 = vld1q_f32(src + src_stride * 2); + float32x4_t src3 = vld1q_f32(src + src_stride * 3); + float32x4_t src4 = vld1q_f32(src + src_stride * 4); + float32x4_t src5 = vld1q_f32(src + src_stride * 5); + float32x4_t src6 = vld1q_f32(src + src_stride * 6); + float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t dst0 = vaddq_f32(vsubq_f32(src0, src6), + vmulq_n_f32(vsubq_f32(src4, src2), 5.25)); + float32x4_t dst7 = vaddq_f32(vsubq_f32(src7, src1), + vmulq_n_f32(vsubq_f32(src3, src5), 5.25)); + + float32x4_t tmp12a = + vsubq_f32(vaddq_f32(src2, src6), vmulq_n_f32(src4, 4.25)); + float32x4_t tmp12b = + vsubq_f32(vaddq_f32(src1, src5), vmulq_n_f32(src3, 4.25)); + float32x4_t dst1 = vaddq_f32(tmp12a, tmp12b); + float32x4_t dst2 = vsubq_f32(tmp12a, tmp12b); + + float32x4_t tmp34a = vsubq_f32(vaddq_f32(src6, vmulq_n_f32(src2, 0.25)), + vmulq_n_f32(src4, 1.25)); + float32x4_t tmp34b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 0.5), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 2)); + float32x4_t dst3 = vaddq_f32(tmp34a, tmp34b); + float32x4_t dst4 = 
vsubq_f32(tmp34a, tmp34b); + + float32x4_t tmp56a = + vaddq_f32(src6, vmulq_n_f32(vsubq_f32(src2, vmulq_n_f32(src4, 1.25)), 4)); + float32x4_t tmp56b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 2), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 0.5)); + float32x4_t dst5 = vaddq_f32(tmp56a, tmp56b); + float32x4_t dst6 = vsubq_f32(tmp56a, tmp56b); + + vst1q_f32(dest, dst0); + vst1q_f32(dest + dest_stride, dst1); + vst1q_f32(dest + dest_stride * 2, dst2); + vst1q_f32(dest + dest_stride * 3, dst3); + vst1q_f32(dest + dest_stride * 4, dst4); + vst1q_f32(dest + dest_stride * 5, dst5); + vst1q_f32(dest + dest_stride * 6, dst6); + vst1q_f32(dest + dest_stride * 7, dst7); +} + +// BT=[1, 0, -1, 0, +// 0, 1, 1, 0, +// 0, -1, 1, 0, +// 0, 1, 0, -1] +void input_trans_c4_4x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride) { + float32x4_t src00 = vld1q_f32(src); + float32x4_t src01 = vld1q_f32(src + src_stride); + float32x4_t src02 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src03 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src10 = vld1q_f32(src); + float32x4_t src11 = vld1q_f32(src + src_stride); + float32x4_t src12 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src13 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src20 = vld1q_f32(src); + float32x4_t src21 = vld1q_f32(src + src_stride); + float32x4_t src22 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src23 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src30 = vld1q_f32(src); + float32x4_t src31 = vld1q_f32(src + src_stride); + float32x4_t src32 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src33 = vld1q_f32(src + src_stride + src_stride + src_stride); + + float32x4_t dst00 = vsubq_f32(src00, src02); + float32x4_t dst10 = vaddq_f32(src01, src02); + float32x4_t dst20 = vsubq_f32(src02, src01); + float32x4_t dst30 = vsubq_f32(src01, src03); + + float32x4_t dst01 = vsubq_f32(src10, src12); + float32x4_t dst11 = vaddq_f32(src11, src12); + float32x4_t dst21 = vsubq_f32(src12, src11); + float32x4_t dst31 = vsubq_f32(src11, src13); + + float32x4_t dst02 = vsubq_f32(src20, src22); + float32x4_t dst12 = vaddq_f32(src21, src22); + float32x4_t dst22 = vsubq_f32(src22, src21); + float32x4_t dst32 = vsubq_f32(src21, src23); + + float32x4_t dst03 = vsubq_f32(src30, src32); + float32x4_t dst13 = vaddq_f32(src31, src32); + float32x4_t dst23 = vsubq_f32(src32, src31); + float32x4_t dst33 = vsubq_f32(src31, src33); + + float32x4_t dest00 = vsubq_f32(dst00, dst02); + float32x4_t dest10 = vaddq_f32(dst01, dst02); + float32x4_t dest20 = vsubq_f32(dst02, dst01); + float32x4_t dest30 = vsubq_f32(dst01, dst03); + + float32x4_t dest01 = vsubq_f32(dst10, dst12); + float32x4_t dest11 = vaddq_f32(dst11, dst12); + float32x4_t dest21 = vsubq_f32(dst12, dst11); + float32x4_t dest31 = vsubq_f32(dst11, dst13); + + float32x4_t dest02 = vsubq_f32(dst20, dst22); + float32x4_t dest12 = vaddq_f32(dst21, dst22); + float32x4_t dest22 = vsubq_f32(dst22, dst21); + float32x4_t dest32 = vsubq_f32(dst21, dst23); + + float32x4_t dest03 = vsubq_f32(dst30, dst32); + float32x4_t dest13 = vaddq_f32(dst31, dst32); + float32x4_t dest23 = vsubq_f32(dst32, dst31); + float32x4_t dest33 = vsubq_f32(dst31, dst33); + + vst1q_f32(dest, dest00); + vst1q_f32(dest + dest_stride, dest10); + vst1q_f32(dest + dest_stride + dest_stride, dest20); + vst1q_f32(dest + 
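The two 1-D transforms above compose into the standard Winograd form: with d a 4x4 input tile, g the 3x3 kernel, and BT as given in the comment, the output tile is

    Y = AT * [ (G * g * GT) (.) (BT * d * B) ] * A

where (.) denotes elementwise multiplication, AT is the 2x4 matrix shown before output_trans_c4_post_2x4, and G is the 4x3 weight transform applied in weight_trans_c4_4x4; each float32x4 lane carries one channel of the c4 block.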
dest_stride + dest_stride + dest_stride, dest30); + dest += dest_h_stride; + vst1q_f32(dest, dest01); + vst1q_f32(dest + dest_stride, dest11); + vst1q_f32(dest + dest_stride + dest_stride, dest21); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest31); + dest += dest_h_stride; + vst1q_f32(dest, dest02); + vst1q_f32(dest + dest_stride, dest12); + vst1q_f32(dest + dest_stride + dest_stride, dest22); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest32); + dest += dest_h_stride; + vst1q_f32(dest, dest03); + vst1q_f32(dest + dest_stride, dest13); + vst1q_f32(dest + dest_stride + dest_stride, dest23); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest33); +} + +// AT=[1, 1, 1, 0, +// 0, 1, -1, -1] +void output_trans_c4_post_2x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride, + float* bias_value, + bool has_relu) { + float32x4_t src00 = vld1q_f32(src); + float32x4_t src01 = vld1q_f32(src + src_stride); + float32x4_t src02 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src03 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src10 = vld1q_f32(src); + float32x4_t src11 = vld1q_f32(src + src_stride); + float32x4_t src12 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src13 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src20 = vld1q_f32(src); + float32x4_t src21 = vld1q_f32(src + src_stride); + float32x4_t src22 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src23 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src30 = vld1q_f32(src); + float32x4_t src31 = vld1q_f32(src + src_stride); + float32x4_t src32 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src33 = vld1q_f32(src + src_stride + src_stride + src_stride); + + float32x4_t dst00 = vaddq_f32(vaddq_f32(src00, src01), src02); + float32x4_t dst10 = vsubq_f32(vsubq_f32(src01, src02), src03); + float32x4_t dst01 = vaddq_f32(vaddq_f32(src10, src11), src12); + float32x4_t dst11 = vsubq_f32(vsubq_f32(src11, src12), src13); + float32x4_t dst02 = vaddq_f32(vaddq_f32(src20, src21), src22); + float32x4_t dst12 = vsubq_f32(vsubq_f32(src21, src22), src23); + float32x4_t dst03 = vaddq_f32(vaddq_f32(src30, src31), src32); + float32x4_t dst13 = vsubq_f32(vsubq_f32(src31, src32), src33); + + float32x4_t dest00 = vaddq_f32(vaddq_f32(dst00, dst01), dst02); + float32x4_t dest10 = vsubq_f32(vsubq_f32(dst01, dst02), dst03); + float32x4_t dest01 = vaddq_f32(vaddq_f32(dst10, dst11), dst12); + float32x4_t dest11 = vsubq_f32(vsubq_f32(dst11, dst12), dst13); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest00 = vaddq_f32(dest00, bias); + dest10 = vaddq_f32(dest10, bias); + dest01 = vaddq_f32(dest01, bias); + dest11 = vaddq_f32(dest11, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest00 = vmaxq_f32(dest00, zeros); + dest10 = vmaxq_f32(dest10, zeros); + dest01 = vmaxq_f32(dest01, zeros); + dest11 = vmaxq_f32(dest11, zeros); + } + + vst1q_f32(dest, dest00); + vst1q_f32(dest + dest_stride, dest10); + dest += dest_h_stride; + vst1q_f32(dest, dest01); + vst1q_f32(dest + dest_stride, dest11); +} +void weight_trans_c4_8x8( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, 
+                             {1.0f / 90, -1.0f / 45, 2.0f / 45},
+                             {32.0f / 45, 16.0f / 45, 8.0f / 45},
+                             {32.0f / 45, -16.0f / 45, 8.0f / 45},
+                             {0.0f, 0.0f, 1.0f}};
+
+  float* ptr_out = static_cast<float*>(workspace);
+
+  for (int i = 0; i < ch_out; i++) {
+    for (int j = 0; j < ch_in; j++) {
+      const float* kernel0 =
+          static_cast<const float*>(din) + (i * ch_in + j) * 9;
+      float* ptr_channel = ptr_out + (i * ch_in + j) * 64;
+
+      //! transform kernel, transposed
+      const float* k0 = kernel0;
+      const float* k1 = kernel0 + 3;
+      const float* k2 = kernel0 + 6;
+
+      //! h
+      float tmp[8][3];
+      for (int i = 0; i < 8; i++) {
+        tmp[i][0] =
+            k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2];
+        tmp[i][1] =
+            k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2];
+        tmp[i][2] =
+            k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2];
+      }
+
+      //! v
+      for (int j = 0; j < 8; j++) {
+        float* tmpp = &tmp[j][0];
+        for (int i = 0; i < 8; i++) {
+          ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] +
+                                   tmpp[1] * coeff[i][1] +
+                                   tmpp[2] * coeff[i][2];
+        }
+      }
+    }
+  }
+
+  int oc_pad = (ch_out + 3) / 4 * 4;
+  int ic_pad = (ch_in + 3) / 4 * 4;
+  int c_stride = ic_pad * oc_pad;
+  for (int i = 0; i < ch_out * ch_in * 64; ++i) {
+    int new_c = i % 64;
+    int new_oc = i / ch_in / 64 / 4;
+    int new_ic = i / 64 % (ch_in * 4) % ch_in;
+    int new_inner = i / ch_in / 64 % 4;
+    int dest_ind =
+        new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
+    dest[dest_ind] = ptr_out[i];
+  }
+}
+
+void weight_trans_c4_4x4(
+    float* dest, const float* din, int ch_in, int ch_out, void* workspace) {
+  const float coeff[4][3] = {{1.0f, 0.0f, 0.0f},
+                             {0.5f, 0.5f, 0.5f},
+                             {0.5f, -0.5f, 0.5f},
+                             {0.0f, 0.0f, 1.0f}};
+
+  float* ptr_out = static_cast<float*>(workspace);
+
+  for (int i = 0; i < ch_out; i++) {
+    for (int j = 0; j < ch_in; j++) {
+      const float* kernel0 =
+          static_cast<const float*>(din) + (i * ch_in + j) * 9;
+      float* ptr_channel = ptr_out + (i * ch_in + j) * 16;
+
+      //! transform kernel, transposed
+      const float* k0 = kernel0;
+      const float* k1 = kernel0 + 3;
+      const float* k2 = kernel0 + 6;
+
+      //! h
+      float tmp[4][3];
+      for (int i = 0; i < 4; i++) {
+        tmp[i][0] =
+            k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2];
+        tmp[i][1] =
+            k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2];
+        tmp[i][2] =
+            k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2];
+      }
+
+      //! v
+      for (int j = 0; j < 4; j++) {
+        float* tmpp = &tmp[j][0];
+        for (int i = 0; i < 4; i++) {
+          ptr_channel[j * 4 + i] = tmpp[0] * coeff[i][0] +
+                                   tmpp[1] * coeff[i][1] +
+                                   tmpp[2] * coeff[i][2];
+        }
+      }
+    }
+  }
+
+  int oc_pad = (ch_out + 3) / 4 * 4;
+  int ic_pad = (ch_in + 3) / 4 * 4;
+  int c_stride = ic_pad * oc_pad;
+  for (int i = 0; i < ch_out * ch_in * 16; ++i) {
+    int new_c = i % 16;
+    int new_oc = i / ch_in / 16 / 4;
+    int new_ic = i / 16 % (ch_in * 4) % ch_in;
+    int new_inner = i / ch_in / 16 % 4;
+    int dest_ind =
+        new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
+    dest[dest_ind] = ptr_out[i];
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc
deleted file mode 100644
index 99aeea8bdea2a50795dcdca18464a196ee877291..0000000000000000000000000000000000000000
--- a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc
+++ /dev/null
@@ -1,538 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
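The two `weight_trans_c4_*` routines above implement the Winograd kernel transform U = G g Gᵀ per 3x3 kernel (the 8x3 coefficient table is G for F(6x6,3x3), the 4x3 table for F(2x2,3x3)), storing a transposed intermediate (their "transform kernel, transposed" step) before repacking U into the padded c4 block layout. As a plain scalar cross-check of the transform math only, not the repack, something like the following should agree lane-for-lane with the F(2x2,3x3) path; `winograd_kernel_transform_f2x2` is a hypothetical helper name, not part of the patch:

```cpp
// Scalar reference for U = G * g * G^T, F(2x2, 3x3).
// G is the same 4x3 table used by weight_trans_c4_4x4 above.
static const float G[4][3] = {{1.0f, 0.0f, 0.0f},
                              {0.5f, 0.5f, 0.5f},
                              {0.5f, -0.5f, 0.5f},
                              {0.0f, 0.0f, 1.0f}};

// g: one 3x3 kernel (row major); u: 4x4 transformed kernel (row major).
void winograd_kernel_transform_f2x2(const float g[9], float u[16]) {
  float tmp[4][3];  // tmp = G * g  (4x3)
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 3; ++j) {
      tmp[i][j] = G[i][0] * g[j] + G[i][1] * g[3 + j] + G[i][2] * g[6 + j];
    }
  }
  for (int i = 0; i < 4; ++i) {  // u = tmp * G^T  (4x4)
    for (int j = 0; j < 4; ++j) {
      u[i * 4 + j] =
          tmp[i][0] * G[j][0] + tmp[i][1] * G[j][1] + tmp[i][2] * G[j][2];
    }
  }
}
```

The repack loop that follows in the real code then permutes U into `new_c * c_stride + new_oc * ic_pad * 4 + ...` order so the GEMM stage can stream one 4-channel block per transformed coordinate.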
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_3x3s1_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 2; - const int out_w_kernel = 4; - const int win_ext = ow + 2; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh + 2; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - float* ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? 
remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } - float32x4_t vbias = vld1q_f32(bias_local); -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc00 = dout_c00 + h * ow; - float* outc01 = outc00 + ow; - float* outc10 = outc00 + size_out_channel; - float* outc11 = outc10 + ow; - float* outc20 = outc10 + size_out_channel; - float* outc21 = outc20 + ow; - float* outc30 = outc20 + size_out_channel; - float* outc31 = outc30 + ow; - const float* inr0 = pre_din + h * row_len; - const float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - const float* inr3 = inr2 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc10 = ptr_write; - outc11 = ptr_write; - case 2: - outc20 = ptr_write; - outc21 = ptr_write; - case 1: - outc30 = ptr_write; - outc31 = ptr_write; - default: - break; - } - } - if (h + out_h_kernel > oh) { - outc01 = ptr_write; - outc11 = ptr_write; - outc21 = ptr_write; - outc31 = ptr_write; - } - float* outl[] = {outc00, - outc10, - outc20, - outc30, - outc01, - outc11, - outc21, - outc31, - reinterpret_cast(bias_local), - reinterpret_cast(flag_relu)}; - void* outl_ptr = reinterpret_cast(outl); - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - float* out0 = pre_out; -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ - "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ - "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ - "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ - "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ - "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ - "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ - "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ - "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ - "fmul v22.4s , 
%[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ - "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ - "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ - "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ - "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ - "ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ - "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ - "ldp x0, x1, [%[outl]] \n" - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ - "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ - "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ - "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ - "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ - "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ - "ldp x2, x3, [%[outl], #16] \n" - "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v9.4s\n" /* 
outr13 = w6 * r3[3]*/ - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ - "ldp x4, x5, [%[outl], #32] \n" - "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ - "ldp x6, x7, [%[outl], #48] \n" - "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ - "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ - "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ - "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ - - "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ - "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ - "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ - "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ - "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ - "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ - "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ - "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ - - /* transpose */ - "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ - "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ - "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ - "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ - "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ - "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ - "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ - "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ - "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ - - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "0:\n" - "cbnz %w[flag_mask], 1f\n" - "str q15, [x0]\n" /* save outc00 */ - "str q16, [x4]\n" /* save outc01 */ - "str q17, [x1]\n" /* save outc10 */ - "str q18, [x5]\n" /* save outc11 */ - "str q19, [x2]\n" /* save outc20 */ - "str q20, [x6]\n" /* save outc21 */ - "str q21, [x3]\n" /* save outc30 */ - "str q22, [x7]\n" /* save outc31 */ - "b 2f\n" - "1:\n" - "str q15, [%[out]], #16 \n" /* save remain to pre_out */ - "str q17, [%[out]], #16 \n" /* save remain to pre_out */ - "str q19, [%[out]], #16 \n" /* save remain to pre_out */ - "str q21, [%[out]], #16 \n" /* save remain to pre_out */ - "str q16, [%[out]], #16 \n" /* save remain to pre_out */ - "str q18, [%[out]], #16 \n" /* save remain to pre_out */ - "str q20, [%[out]], #16 \n" /* save remain to pre_out */ - "str q22, [%[out]], #16 \n" /* 
save remain to pre_out */ - "2:\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), [inr3] "+r"(inr3), - [out]"+r"(out0) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [vbias]"w" (vbias), [outl] "r" (outl_ptr), - [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v9", "v10", "v11", "v15", - "v16","v17","v18","v19","v20","v21","v22", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" - ); -#else - asm volatile( - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" - /* load r0, r1 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" - "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" - /* main loop */ - "0: @ main loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vmul.f32 q8, q5, q0 @ w0 * inr00\n" - "vmul.f32 q9, q5, q1 @ w0 * inr01\n" - "vmul.f32 q10, q5, q2 @ w0 * inr02\n" - "vmul.f32 q11, q5, q3 @ w0 * inr03\n" - "vmla.f32 q8, q6, q1 @ w1 * inr01\n" - "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" - "vmla.f32 q9, q6, q2 @ w1 * inr02\n" - "vmla.f32 q10, q6, q3 @ w1 * inr03\n" - "vmla.f32 q11, q6, q0 @ w1 * inr04\n" - "vmla.f32 q8, q7, q2 @ w2 * inr02\n" - "vmla.f32 q9, q7, q3 @ w2 * inr03\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" - "vmla.f32 q10, q7, q0 @ w2 * inr04\n" - "vmla.f32 q11, q7, q1 @ w2 * inr05\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" - /* mul r1 with w0-w5, get out r0, r1 */ - "vmul.f32 q12, q5, q2 @ w0 * inr10\n" - "vmul.f32 q13, q5, q3 @ w0 * inr11\n" - "vmul.f32 q14, q5, q0 @ w0 * inr12\n" - "vmul.f32 q15, q5, q1 @ w0 * inr13\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w4 to q5\n" - "vmla.f32 q8, q4, q2 @ w3 * inr10\n" - "vmla.f32 q9, q4, q3 @ w3 * inr11\n" - "vmla.f32 q10, q4, q0 @ w3 * inr12\n" - "vmla.f32 q11, q4, q1 @ w3 * inr13\n" - /* mul r1 with w1, w4, get out r1, r0 */ - "vmla.f32 q8, q5, q3 @ w4 * inr11\n" - "vmla.f32 q12, q6, q3 @ w1 * inr11\n" - "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" - "vmla.f32 q9, q5, q0 @ w4 * inr12\n" - "vmla.f32 q13, q6, q0 @ w1 * inr12\n" - "vmla.f32 q10, q5, q1 @ w4 * inr13\n" - "vmla.f32 q14, q6, q1 @ w1 * inr13\n" - "vmla.f32 q11, q5, q2 @ w4 * inr14\n" - "vmla.f32 q15, q6, q2 @ w1 * inr14\n" - "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" - /* mul r1 with w2, w5, get out r1, r0 */ - "vmla.f32 q12, q7, q0 @ w2 * inr12\n" - "vmla.f32 q13, q7, q1 @ w2 * inr13\n" - "vmla.f32 q8, q6, q0 @ w5 * inr12\n" - "vmla.f32 q9, q6, q1 @ w5 * inr13\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" - "vmla.f32 q14, q7, q2 @ w2 * inr14\n" - "vmla.f32 q15, q7, q3 @ w2 * inr15\n" - "vmla.f32 q10, q6, q2 @ w5 * inr14\n" - "vmla.f32 q11, q6, q3 @ w5 * inr15\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" - /* mul r2 with w3-w8, get out r0, r1 */ - "vmla.f32 q12, q4, q0 @ w3 * inr20\n" - "vmla.f32 q13, q4, q1 @ w3 * inr21\n" - "vmla.f32 q14, q4, q2 @ w3 * inr22\n" - "vmla.f32 q15, q4, q3 @ w3 * inr23\n" - "vld1.32 {d8-d9}, [%[wc0]]! 
@ load w7, to q4\n" - "vmla.f32 q8, q7, q0 @ w6 * inr20\n" - "vmla.f32 q9, q7, q1 @ w6 * inr21\n" - "vmla.f32 q10, q7, q2 @ w6 * inr22\n" - "vmla.f32 q11, q7, q3 @ w6 * inr23\n" - /* mul r2 with w4, w7, get out r1, r0 */ - "vmla.f32 q8, q4, q1 @ w7 * inr21\n" - "vmla.f32 q12, q5, q1 @ w4 * inr21\n" - "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" - "vmla.f32 q9, q4, q2 @ w7 * inr22\n" - "vmla.f32 q13, q5, q2 @ w4 * inr22\n" - "vmla.f32 q10, q4, q3 @ w7 * inr23\n" - "vmla.f32 q14, q5, q3 @ w4 * inr23\n" - "vmla.f32 q11, q4, q0 @ w7 * inr24\n" - "vmla.f32 q15, q5, q0 @ w4 * inr24\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" - /* mul r1 with w5, w8, get out r1, r0 */ - "vmla.f32 q12, q6, q2 @ w5 * inr22\n" - "vmla.f32 q13, q6, q3 @ w5 * inr23\n" - "vmla.f32 q8, q5, q2 @ w8 * inr22\n" - "vmla.f32 q9, q5, q3 @ w8 * inr23\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" - "ldr r4, [%[outl], #32] @ load bias addr to r4\n" - "vmla.f32 q14, q6, q0 @ w5 * inr24\n" - "vmla.f32 q15, q6, q1 @ w5 * inr25\n" - "vmla.f32 q10, q5, q0 @ w8 * inr24\n" - "vmla.f32 q11, q5, q1 @ w8 * inr25\n" - "vld1.32 {d0-d3}, [%[r3]]! @ load r3, q0, q1\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q7, q2 @ w6 * inr30\n" - "vmla.f32 q13, q7, q3 @ w6 * inr31\n" - "vmla.f32 q14, q7, q0 @ w6 * inr32\n" - "vmla.f32 q15, q7, q1 @ w6 * inr33\n" - "vmla.f32 q12, q4, q3 @ w7 * inr31\n" - "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" - "vld1.32 {d12-d13}, [r4] @ load bias\n" - "vmla.f32 q13, q4, q0 @ w7 * inr32\n" - "vmla.f32 q14, q4, q1 @ w7 * inr33\n" - "vmla.f32 q15, q4, q2 @ w7 * inr34\n" - "ldr r0, [%[outl]] @ load outc00 to r0\n" - "vmla.f32 q12, q5, q0 @ w8 * inr32\n" - "vmla.f32 q13, q5, q1 @ w8 * inr33\n" - "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" - "vmla.f32 q14, q5, q2 @ w8 * inr34\n" - "vmla.f32 q15, q5, q3 @ w8 * inr35\n" - "ldr r1, [%[outl], #4] @ load outc10 to r1\n" - "vadd.f32 q8, q8, q6 @ r00 add bias\n" - "vadd.f32 q9, q9, q6 @ r01 add bias\n" - "vadd.f32 q10, q10, q6 @ r02 add bias\n" - "vadd.f32 q11, q11, q6 @ r03 add bias\n" - "ldr r2, [%[outl], #8] @ load outc20 to r2\n" - "vadd.f32 q12, q12, q6 @ r10 add bias\n" - "vadd.f32 q13, q13, q6 @ r11 add bias\n" - "vadd.f32 q14, q14, q6 @ r12 add bias\n" - "vadd.f32 q15, q15, q6 @ r13 add bias\n" - "ldr r3, [%[outl], #12] @ load outc30 to r3\n" - "vmov.u32 q7, #0 @ mov zero to q7\n" - "cmp r5, #0 @ cmp flag relu\n" - "beq 1f @ skip relu\n" - "vmax.f32 q8, q8, q7 @ r00 relu\n" - "vmax.f32 q9, q9, q7 @ r01 relu\n" - "vmax.f32 q10, q10, q7 @ r02 relu\n" - "vmax.f32 q11, q11, q7 @ r03 relu\n" - "vmax.f32 q12, q12, q7 @ r10 relu\n" - "vmax.f32 q13, q13, q7 @ r11 relu\n" - "vmax.f32 q14, q14, q7 @ r12 relu\n" - "vmax.f32 q15, q15, q7 @ r13 relu\n" - "1:\n" - "ldr r4, [%[outl], #16] @ load outc01 to r4\n" - "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" - "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" - "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" - "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" - "ldr r5, [%[outl], #20] @ load outc11 to r5\n" - "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" - "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" - "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" - "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" - "cmp %[flag_mask], #0 @ cmp flag mask\n" - "bne 2f\n" - "vst1.32 {d16-d17}, [r0] @ save outc00\n" - "vst1.32 {d18-d19}, [r1] @ save outc10\n" - "vst1.32 {d20-d21}, [r2] @ save 
outc20\n" - "vst1.32 {d22-d23}, [r3] @ save outc30\n" - "vst1.32 {d24-d25}, [r4] @ save outc01\n" - "vst1.32 {d26-d27}, [r5] @ save outc11\n" - "ldr r0, [%[outl], #24] @ load outc21 to r0\n" - "ldr r1, [%[outl], #28] @ load outc31 to r1\n" - "vst1.32 {d28-d29}, [r0] @ save outc21\n" - "vst1.32 {d30-d31}, [r1] @ save outc31\n" - "b 3f @ branch end\n" - "2: \n" - "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d30-d31}, [%[out0]]! @ save remain to pre_out\n" - "3: \n" - : [r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [r3] "+r"(inr3), - [out0] "+r"(out0), [wc0] "+r"(weight_c) - : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" - ); -#endif // __arch64__ - // clang-format on - outl[0] += 4; - outl[1] += 4; - outl[2] += 4; - outl[3] += 4; - outl[4] += 4; - outl[5] += 4; - outl[6] += 4; - outl[7] += 4; - if (flag_mask) { - memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); - memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); - memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); - memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); - memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); - memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); - memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); - memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index 6a1fa37681585883280625a22c15aec43c6554af..5cee02b639af7e04a9184af765a5e96be4cb4cdb 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param, auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); const int threads = ctx->threads(); + auto paddings = *param.paddings; int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -74,9 +75,11 @@ void conv_3x3s1_direct_fp32(const float* i_data, ARMContext* ctx) { const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto act_param = param.activation_param; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round + 2; bool flag_relu = param.fuse_relu; @@ -467,7 +470,8 @@ void conv_3x3s1_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } const float* weight_remain_ptr = weights + c_round_down * w_stride; #pragma omp parallel for num_threads(threads) @@ -778,7 +782,8 
@@ void conv_3x3s1_direct_fp32(const float* i_data,
                        oh,
                        ow,
                        flag_relu,
-                       ptr_write);
+                       ptr_write,
+                       &act_param);
     }
   }
 }
diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc
index f966313e118acf3f74124aca1d16aa3c50009bb8..64e72bc441bb93fa955e12ff53ce17f0e37b4830 100644
--- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc
+++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc
@@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din,
                             const operators::ConvParam& param,
                             Context<TARGET(kARM)>* ctx,
                             const float* scale) {
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[2];
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / 4;
diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66d61413fc43fd518e0b34c7bc8d7b7bf5cc72a7
--- /dev/null
+++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -0,0 +1,4094 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
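The hunks above stop indexing `param.paddings` directly and instead copy `*param.paddings`, then read `paddings[0]` for `pad_h` and `paddings[2]` for `pad_w`. That only works if `paddings` is now a shared four-element vector ordered {top, bottom, left, right}, which lets asymmetric padding be expressed. A minimal sketch under that assumption; `ConvParamSketch` and `read_pads` are illustrative stand-ins, not the real `operators::ConvParam` API:

```cpp
#include <memory>
#include <vector>

// Assumed layout behind `auto paddings = *param.paddings;`:
// a shared 4-element vector {top, bottom, left, right}.
struct ConvParamSketch {
  std::shared_ptr<std::vector<int>> paddings;
};

void read_pads(const ConvParamSketch& param, int* pad_h, int* pad_w) {
  auto paddings = *param.paddings;  // copy the 4-element vector
  *pad_h = paddings[0];             // top padding drives pad_h
  *pad_w = paddings[2];             // left padding drives pad_w (index 2, not 1)
}
```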
+
+#include <arm_neon.h>
+#include "lite/backends/arm/math/conv_depthwise.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+void conv_depthwise_3x3s1p0_bias(float *dout, const float *din,
+                                 const float *weights, const float *bias,
+                                 bool flag_bias, const int num,
+                                 const int ch_in, const int h_in,
+                                 const int w_in, const int h_out,
+                                 const int w_out,
+                                 const operators::ActivationParam act_param,
+                                 ARMContext *ctx);
+
+void conv_depthwise_3x3s1p0_bias_s(float *dout, const float *din,
+                                   const float *weights, const float *bias,
+                                   bool flag_bias, const int num,
+                                   const int ch_in, const int h_in,
+                                   const int w_in, const int h_out,
+                                   const int w_out,
+                                   const operators::ActivationParam act_param,
+                                   ARMContext *ctx);
+
+void conv_depthwise_3x3s1p1_bias(float *dout, const float *din,
+                                 const float *weights, const float *bias,
+                                 bool flag_bias, const int num,
+                                 const int ch_in, const int h_in,
+                                 const int w_in, const int h_out,
+                                 const int w_out,
+                                 const operators::ActivationParam act_param,
+                                 ARMContext *ctx);
+
+void conv_depthwise_3x3s1p1_bias_s(float *dout, const float *din,
+                                   const float *weights, const float *bias,
+                                   bool flag_bias, const int num,
+                                   const int ch_in, const int h_in,
+                                   const int w_in, const int h_out,
+                                   const int w_out,
+                                   const operators::ActivationParam act_param,
+                                   ARMContext *ctx);
+
+// Dispatch on padding (p0/p1) and on input width: narrow inputs
+// (w_in <= 5 for pad 0, w_in <= 4 for pad 1) take the "_s" small-width kernels.
+void conv_depthwise_3x3s1_fp32(const float *din, float *dout, int num,
+                               int ch_out, int h_out, int w_out, int ch_in,
+                               int h_in, int w_in, const float *weights,
+                               const float *bias, int pad, bool flag_bias,
+                               const operators::ActivationParam act_param,
+                               ARMContext *ctx) {
+  if (pad == 0) {
+    if (w_in > 5) {
+      conv_depthwise_3x3s1p0_bias(dout, din, weights, bias, flag_bias, num,
+                                  ch_in, h_in, w_in, h_out, w_out, act_param,
+                                  ctx);
+    } else {
+      conv_depthwise_3x3s1p0_bias_s(dout, din, weights, bias, flag_bias, num,
+                                    ch_in, h_in, w_in, h_out, w_out,
+                                    act_param, ctx);
+    }
+  }
+  if (pad == 1) {
+    if (w_in > 4) {
+      conv_depthwise_3x3s1p1_bias(dout, din, weights, bias, flag_bias, num,
+                                  ch_in, h_in, w_in, h_out, w_out, act_param,
+                                  ctx);
+    } else {
+      conv_depthwise_3x3s1p1_bias_s(dout, din, weights, bias, flag_bias, num,
+                                    ch_in, h_in, w_in, h_out, w_out,
+                                    act_param, ctx);
+    }
+  }
+}
+
+#ifdef __aarch64__
+#define INIT_S1                                                       \
+  "PRFM PLDL1KEEP, [%[din_ptr0]] \n"                                  \
+  "PRFM PLDL1KEEP, [%[din_ptr1]] \n"                                  \
+  "PRFM PLDL1KEEP, [%[din_ptr2]] \n"                                  \
+  "PRFM PLDL1KEEP, [%[din_ptr3]] \n"                                  \
+  "PRFM PLDL1KEEP, [%[din_ptr4]] \n"                                  \
+  "PRFM PLDL1KEEP, [%[din_ptr5]] \n"                                  \
+  "movi v21.4s, #0x0\n" /* out0 = 0 */                                \
+                                                                      \
+  "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/        \
+  "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr1)*/        \
+  "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr2)*/        \
+  "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr3)*/        \
+                                                                      \
+  "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0 + 4)*/         \
+  "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr1 + 4)*/         \
+  "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr2 + 4)*/         \
+  "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr3 + 4)*/         \
+                                                                      \
+  "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/          \
+  "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/          \
+  "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/          \
+  "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/
+
+#define LEFT_COMPUTE_S1                                               \
+  "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/        \
+  "ext
v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + 
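+  /* Left-edge trick used throughout LEFT_COMPUTE_S1: ext with vzero      */ \
+  /* ("ext v16, %[vzero], vN, #12") shifts one zero into lane 0, giving   */ \
+  /* the {0, d0, d1, d2} window, so pad == 1 needs no out-of-bounds load; */ \
+  /* "ext v17, vN, vN+1, #4" builds the shifted {d1, d2, d3, d4} window.  */ \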
"fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ + +#define LEFT_RESULT_S1 \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* 
outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ + +#define MID_RESULT_S1 \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "movi v20.4s, #0 \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, v20.16b, v18.16b \n" \ + "bif v1.16b, v20.16b, v19.16b \n" 
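+  /* RIGHT_COMPUTE_S1 handles the width remainder: bif against v20       */ \
+  /* (zeros) under the vmask vectors v18/v19 keeps input lanes inside    */ \
+  /* the row and zeroes lanes past its end, while v22-v25 preload the    */ \
+  /* current output so RIGHT_RESULT_S1 can bif the valid result lanes    */ \
+  /* over them on store.                                                 */ \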
\ + "bif v2.16b, v20.16b, v18.16b \n" \ + "bif v3.16b, v20.16b, v19.16b \n" \ + \ + "bif v4.16b, v20.16b, v18.16b \n" \ + "bif v5.16b, v20.16b, v19.16b \n" \ + "bif v6.16b, v20.16b, v18.16b \n" \ + "bif v7.16b, v20.16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, v20.16b, v18.16b \n" \ + "bif v9.16b, v20.16b, v19.16b \n" \ + "bif v10.16b, v20.16b, v18.16b \n" \ + "bif v11.16b, v20.16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ + +#define RIGHT_RESULT_S1 \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 
{v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define LEFT_RESULT_S1_RELU6 \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * 
w1[1]*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define LEFT_RESULT_S1_LEAKY_RELU \ + "fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" /* choose*/ \ + "bif v13.16b, v21.16b, v19.16b \n" /* choose*/ \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fcmge v18.4s, v14.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "bif v14.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fcmge v18.4s, v15.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v15.16b, v20.16b, v18.16b \n" /* choose*/ \ + "cmp %w[cnt], #1 \n" \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + "movi v20.4s, #0 \n" \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define MID_RESULT_S1_RELU6 \ + "movi v20.4s, #0 \n" \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla 
v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define MID_RESULT_S1_LEAKY_RELU \ + "movi v21.4s, #0 \n" \ + "fcmge v18.4s, v12.4s, v21.4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fcmge v18.4s, v13.4s, v21.4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v13.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += 
din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "fcmge v18.4s, v14.4s, v21.4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fcmge v18.4s, v15.4s, v21.4s \n" /* vcgeq_f32 */ \ + "fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v15.16b, v20.16b, v18.16b \n" /* choose*/ \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define RIGHT_RESULT_S1_RELU6 \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * 
w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "bif v12.16b, v22.16b, v18.16b \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define RIGHT_RESULT_S1_LEAKY_RELU \ + "movi v1.4s, #0 \n" \ + "fcmge v20.4s, v12.4s, v1.4s \n" /* vcgeq_f32 */ \ + "fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v21.16b, v20.16b \n" /* choose*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "bif v12.16b, v22.16b, v18.16b \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fcmge v20.4s, v13.4s, v1.4s \n" /* vcgeq_f32 */ \ + "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v21.16b, v20.16b \n" \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fcmge v20.4s, v14.4s, v1.4s \n" /* vcgeq_f32 */ \ + "fmul v21.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + \ + 
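// The *_RELU6 sequences above always clamp in two steps: an fmax against
// zero followed by an fmin against the six constant (the vsix operand on
// aarch64, a vector loaded from six_ptr on armv7). A minimal intrinsics
// sketch of that clamp (an illustrative helper, not part of this patch):
#include <arm_neon.h>

static inline float32x4_t relu6_f32x4(float32x4_t x, float32x4_t vsix) {
  // fmax v, v, vzero -> vmaxq_f32; fmin v, v, vsix -> vminq_f32
  return vminq_f32(vmaxq_f32(x, vdupq_n_f32(0.f)), vsix);
}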
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v21.16b, v20.16b \n" \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fcmge v20.4s, v15.4s, v1.4s \n" /* vcgeq_f32 */ \ + "fmul v21.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + "bif v15.16b, v21.16b, v20.16b \n" \ + "bif v15.16b, v25.16b, v18.16b \n" \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v1.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v2.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v3.16b, %[vzero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[vzero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[vzero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[vzero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[vzero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[vzero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[vzero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[vzero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[vzero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s\n" \ + "fmax v13.4s, v13.4s, %[vzero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU6 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s\n" \ + "fmax v13.4s, v13.4s, %[vzero].4s\n" \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s\n" \ + "fmin v13.4s, v13.4s, %[vsix].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_LEAKY_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" \ + "bif v13.16b, v21.16b, v19.16b \n" \ + "st1 {v12.4s}, [%[out1]]\n" 
\ + "st1 {v13.4s}, [%[out2]]\n" +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define LEFT_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.f32 {d28-d29}, [%[six_ptr]] @ load six \n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define LEFT_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vld1.f32 {d28-d29}, [%[scale_ptr]] @ load scale \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vbif q4, q6, q15 @ choose \n" \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q5, q14 \n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vbif q5, q6, q7 @ choose \n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define MID_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif q4, q6, q15 @ choose \n" \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q5, q14 \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + \ + "vbif q5, q6, q7 @ choose \n" \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define RIGHT_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define RIGHT_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vbif q4, q6, q15 @ choose \n" \ + \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q5, q14 \n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + "vbif q5, q6, q7 @ choose \n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU6 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vld1.32 {d20-d21}, [%[six_ptr]] \n" \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vmin.f32 q14, q14, q10 \n" \ + "vmin.f32 q15, q15, q10 \n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_LEAKY_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vld1.32 {d18-d19}, [%[scale_ptr]] 
\n" \ + "vcge.f32 q10, q14, %q[vzero] @ q0 > 0 \n" \ + "vcge.f32 q11, q15, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q12, q14, q9 \n" \ + "vmul.f32 q13, q15, q9 \n" \ + \ + "vbif q14, q12, q10 \n" \ + "vbif q15, q13, q11 \n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif + +#ifdef __aarch64__ +void act_switch_3x3s1p1(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + const float *din_ptr4, + const float 
*din_ptr5, + float *doutr0, + float *doutr1, + float *doutr2, + float *doutr3, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask, + unsigned int *rmask, + float32x4_t vzero, + float *vbias, + int cnt, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+                         RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
+                     : [cnt] "+r"(cnt),
+                       [din_ptr0] "+r"(din_ptr0),
+                       [din_ptr1] "+r"(din_ptr1),
+                       [din_ptr2] "+r"(din_ptr2),
+                       [din_ptr3] "+r"(din_ptr3),
+                       [din_ptr4] "+r"(din_ptr4),
+                       [din_ptr5] "+r"(din_ptr5),
+                       [doutr0] "+r"(doutr0),
+                       [doutr1] "+r"(doutr1),
+                       [doutr2] "+r"(doutr2),
+                       [doutr3] "+r"(doutr3)
+                     : [w0] "w"(wr0),
+                       [w1] "w"(wr1),
+                       [w2] "w"(wr2),
+                       [vscale] "w"(vscale),
+                       [bias_val] "r"(vbias),
+                       [vmask] "r"(vmask),
+                       [rmask] "r"(rmask),
+                       [vzero] "w"(vzero)
+                     : "cc",
+                       "memory",
+                       "v0",
+                       "v1",
+                       "v2",
+                       "v3",
+                       "v4",
+                       "v5",
+                       "v6",
+                       "v7",
+                       "v8",
+                       "v9",
+                       "v10",
+                       "v11",
+                       "v12",
+                       "v13",
+                       "v14",
+                       "v15",
+                       "v16",
+                       "v17",
+                       "v18",
+                       "v19",
+                       "v20",
+                       "v21",
+                       "v22",
+                       "v23",
+                       "v24",
+                       "v25");
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param.active_type)
+                   << " fuse not support";
+    }
+  } else {
+    asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
+                     MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+                 : [cnt] "+r"(cnt),
+                   [din_ptr0] "+r"(din_ptr0),
+                   [din_ptr1] "+r"(din_ptr1),
+                   [din_ptr2] "+r"(din_ptr2),
+                   [din_ptr3] "+r"(din_ptr3),
+                   [din_ptr4] "+r"(din_ptr4),
+                   [din_ptr5] "+r"(din_ptr5),
+                   [doutr0] "+r"(doutr0),
+                   [doutr1] "+r"(doutr1),
+                   [doutr2] "+r"(doutr2),
+                   [doutr3] "+r"(doutr3)
+                 : [w0] "w"(wr0),
+                   [w1] "w"(wr1),
+                   [w2] "w"(wr2),
+                   [bias_val] "r"(vbias),
+                   [vmask] "r"(vmask),
+                   [rmask] "r"(rmask),
+                   [vzero] "w"(vzero)
+                 : "cc",
+                   "memory",
+                   "v0",
+                   "v1",
+                   "v2",
+                   "v3",
+                   "v4",
+                   "v5",
+                   "v6",
+                   "v7",
+                   "v8",
+                   "v9",
+                   "v10",
+                   "v11",
+                   "v12",
+                   "v13",
+                   "v14",
+                   "v15",
+                   "v16",
+                   "v17",
+                   "v18",
+                   "v19",
+                   "v20",
+                   "v21",
+                   "v22",
+                   "v23",
+                   "v24",
+                   "v25");
+  }
+}
+#else
+void act_switch_3x3s1p1(const float *din_ptr0,
+                        const float *din_ptr1,
+                        const float *din_ptr2,
+                        const float *din_ptr3,
+                        float *doutr0,
+                        float *doutr1,
+                        float32x4_t wr0,
+                        float32x4_t wr1,
+                        float32x4_t wr2,
+                        unsigned int *vmask_ptr,
+                        unsigned int *rmask_ptr,
+                        float32x4_t vzero,
+                        float bias_val,
+                        int cnt,
+                        const operators::ActivationParam act_param) {
+  bool has_active = act_param.has_active;
+  if (has_active) {
+    float tmp = act_param.Relu_clipped_coef;
+    float ss = act_param.Leaky_relu_alpha;
+    float vsix[4] = {tmp, tmp, tmp, tmp};
+    float vscale[4] = {ss, ss, ss, ss};
+
+    switch (act_param.active_type) {
+      case lite_api::ActivationType::kRelu:
+        asm volatile(
+            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1
+                MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU
+            : [dout_ptr1] "+r"(doutr0),
+              [dout_ptr2] "+r"(doutr1),
+              [din0_ptr] "+r"(din_ptr0),
+              [din1_ptr] "+r"(din_ptr1),
+              [din2_ptr] "+r"(din_ptr2),
+              [din3_ptr] "+r"(din_ptr3),
+              [cnt] "+r"(cnt),
+              [rmask] "+r"(rmask_ptr),
+              [vmask] "+r"(vmask_ptr)
+            : [wr0] "w"(wr0),
+              [wr1] "w"(wr1),
+              [wr2] "w"(wr2),
+              [bias_val] "r"(bias_val),
+              [vzero] "w"(vzero)
+            : "cc",
+              "memory",
+              "q4",
+              "q5",
+              "q6",
+              "q7",
+              "q8",
+              "q9",
+              "q10",
+              "q11",
+              "q12",
+              "q13",
+              "q14",
+              "q15");
+        break;
+      case lite_api::ActivationType::kRelu6:
+        /* 0 <= din <= 6 */
+        asm volatile(
+            INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1
+                MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6
+            : [dout_ptr1] "+r"(doutr0),
+              [dout_ptr2] "+r"(doutr1),
+              [din0_ptr] "+r"(din_ptr0),
+              [din1_ptr] "+r"(din_ptr1),
+              [din2_ptr] "+r"(din_ptr2),
+              [din3_ptr] "+r"(din_ptr3),
+              [cnt] "+r"(cnt),
+              [rmask] "+r"(rmask_ptr),
+              [vmask] "+r"(vmask_ptr)
+            : [wr0] "w"(wr0),
+              [wr1] "w"(wr1),
+              [wr2] "w"(wr2),
+              [bias_val] "r"(bias_val),
+              [six_ptr] "r"(vsix),
+              [vzero] "w"(vzero)
+            : "cc",
+              "memory",
+              "q4",
+              "q5",
+              "q6",
+              "q7",
+              "q8",
+              "q9",
+              "q10",
+              "q11",
+              "q12",
+              "q13",
+              "q14",
+              "q15");
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+        /*din = din >= 0 ? din : din * scale*/
+        asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU
+                         MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+                         RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
+                       [din0_ptr] "+r"(din_ptr0),
+                       [din1_ptr] "+r"(din_ptr1),
+                       [din2_ptr] "+r"(din_ptr2),
+                       [din3_ptr] "+r"(din_ptr3),
+                       [cnt] "+r"(cnt),
+                       [rmask] "+r"(rmask_ptr),
+                       [vmask] "+r"(vmask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [bias_val] "r"(bias_val),
+                       [scale_ptr] "r"(vscale),
+                       [vzero] "w"(vzero)
+                     : "cc",
+                       "memory",
+                       "q4",
+                       "q5",
+                       "q6",
+                       "q7",
+                       "q8",
+                       "q9",
+                       "q10",
+                       "q11",
+                       "q12",
+                       "q13",
+                       "q14",
+                       "q15");
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param.active_type)
+                   << " fuse not support";
+    }
+  } else {
+    asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1
+                     MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+                 : [dout_ptr1] "+r"(doutr0),
+                   [dout_ptr2] "+r"(doutr1),
+                   [din0_ptr] "+r"(din_ptr0),
+                   [din1_ptr] "+r"(din_ptr1),
+                   [din2_ptr] "+r"(din_ptr2),
+                   [din3_ptr] "+r"(din_ptr3),
+                   [cnt] "+r"(cnt),
+                   [rmask] "+r"(rmask_ptr),
+                   [vmask] "+r"(vmask_ptr)
+                 : [wr0] "w"(wr0),
+                   [wr1] "w"(wr1),
+                   [wr2] "w"(wr2),
+                   [bias_val] "r"(bias_val),
+                   [vzero] "w"(vzero)
+                 : "cc",
+                   "memory",
+                   "q4",
+                   "q5",
+                   "q6",
+                   "q7",
+                   "q8",
+                   "q9",
+                   "q10",
+                   "q11",
+                   "q12",
+                   "q13",
+                   "q14",
+                   "q15");
+  }
+}
+#endif
+/**
+ * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
+ * width > 4
+ */
+void conv_depthwise_3x3s1p1_bias(float *dout,
+                                 const float *din,
+                                 const float *weights,
+                                 const float *bias,
+                                 bool flag_bias,
+                                 const int num,
+                                 const int ch_in,
+                                 const int h_in,
+                                 const int w_in,
+                                 const int h_out,
+                                 const int w_out,
+                                 const operators::ActivationParam act_param,
+                                 ARMContext *ctx) {
+  //! pad is done implicitly
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+  int cnt_col = tile_w - 1;
+
+  unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in);
+  const unsigned int remain_idx[4] = {0, 1, 2, 3};
+
+  if (remain == 0 && size_pad_right == 5) {
+    size_pad_right = 1;
+    cnt_col -= 1;
+    remain = 4;
+  } else if (remain == 0 && size_pad_right == 6) {
+    size_pad_right = 2;
+    cnt_col -= 1;
+    remain = 4;
+  }
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remain_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+        if (i == 0) {
+          din_ptr0 = zero_ptr;
+          din_ptr1 = dr0;
+          din_ptr2 = dr1;
+          din_ptr3 = dr2;
+          din_ptr4 = dr3;
+          din_ptr5 = dr4;
+          dr0 = dr3;
+          dr1 = dr4;
+          dr2 = dr5;
+        } else {
+          dr0 = dr4;
+          dr1 = dr5;
+          dr2 = dr1 + w_in;
+        }
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 > h_in) {
+          switch (i + 5 - h_in) {
+            case 5:
+              din_ptr1 = zero_ptr;
+            case 4:
+              din_ptr2 = zero_ptr;
+            case 3:
+              din_ptr3 = zero_ptr;
+            case 2:
+              din_ptr4 = zero_ptr;
+            case 1:
+              din_ptr5 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //!
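// The masks built above drive the bif/vbif right-edge handling: lane i of
// vmask stays all-ones while the lane still maps to a valid input column,
// so the load is kept there and zero is substituted elsewhere. A scalar
// sketch of the same predicate (illustrative only, not part of this patch):
#include <stdint.h>

static void build_right_pad_mask(uint32_t mask[8], uint32_t size_pad_right) {
  const uint32_t right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
  for (int i = 0; i < 8; ++i) {
    // vcgeq_u32(right_pad_idx, size_pad_right): keep lane when idx >= pad
    mask[i] = (right_pad_idx[i] >= size_pad_right) ? 0xffffffffu : 0u;
  }
}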
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + act_switch_3x3s1p1(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + din_ptr4, + din_ptr5, + doutr0, + doutr1, + doutr2, + doutr3, + wr0, + wr1, + wr2, + vmask, + rmask, + vzero, + vbias, + cnt, + act_param); + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + act_switch_3x3s1p1(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + doutr0, + doutr1, + wr0, + wr1, + wr2, + vmask_ptr, + rmask_ptr, + vzero, + bias_val, + cnt, + act_param); + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +void act_switch_3x3s1p1_s(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp, + float32x4_t vzero, + float32x4_t wbias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); +#else + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] 
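// The case fall-throughs in the bottom-pad switches above are deliberate:
// din_ptr r reads input row (i - 1 + r), so every row index at or past
// h_in must be redirected to the zero buffer. A loop-form sketch of the
// aarch64 path (illustrative; rows[] stands for din_ptr0..din_ptr5):
static void pad_bottom_rows(const float *rows[6],
                            int i,
                            int h_in,
                            const float *zero_ptr) {
  for (int r = 1; r < 6; ++r) {
    if (i - 1 + r >= h_in) rows[r] = zero_ptr;  // rows below the image
  }
}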
"w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [six_ptr] "r"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? din : din * scale*/ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [scale_ptr] "r"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + const operators::ActivationParam act_param, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0_ptr = dr0; + const float *dr1_ptr = dr1; + const float *dr2_ptr = dr2; + const float *dr3_ptr = dr3; + if (j == 0) { + dr0_ptr = zero; + dr1_ptr = dr0; + dr2_ptr = dr1; + dr3_ptr = dr2; + dr0 = dr1; + dr1 = dr2; + } else { + dr0 = dr2; + dr1 = dr3; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (j + 3 > h_in) { + switch (j + 3 - h_in) { + case 3: + dr1_ptr = zero; + case 2: + dr2_ptr = zero; + case 1: + dr3_ptr = zero; + default: + break; + } + } + //! process bottom remain + if (j + 2 > h_out) { + doutr1 = trash_buf; + } + act_switch_3x3s1p1_s(dr0_ptr, + dr1_ptr, + dr2_ptr, + dr3_ptr, + out_buf1, + out_buf2, + wr0, + wr1, + wr2, + vmask_rp, + vzero, + wbias, + act_param); + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} + +#ifdef __aarch64__ +void act_switch_3x3s1p0(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + const float *din_ptr4, + const float *din_ptr5, + float *doutr0, + float *doutr1, + float *doutr2, + float *doutr3, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask, + unsigned int *rmask, + float32x4_t vzero, + float *vbias, + int cnt, + int remain, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] 
"+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU6 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } +} +#else +void act_switch_3x3s1p0(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask_ptr, + unsigned int *rmask_ptr, + float32x4_t vzero, + float bias_val, + int cnt, + int remain, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 
@ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_LEAKY_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile( + INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +} +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + const operators::ActivationParam act_param, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + if (remain == 0 && size_pad_right == 6) { // w_in == w_out and w_out % 4 == 0 + tile_w -= 1; + remain = 4; + size_pad_right = 2; + } + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + act_switch_3x3s1p0(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + din_ptr4, + din_ptr5, + doutr0, + doutr1, + doutr2, + doutr3, + wr0, + wr1, + wr2, + vmask, + rmask, + vzero, + vbias, + cnt, + remain, + act_param); + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + act_switch_3x3s1p0(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + doutr0, + doutr1, + wr0, + wr1, + wr2, + vmask_ptr, + rmask_ptr, + vzero, + bias_val, + cnt, + remain, + act_param); + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +void act_switch_3x3s1p0_s(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp1, + uint32x4_t vmask_rp2, + float32x4_t vzero, + float32x4_t wbias, + unsigned int *vmask_ptr, + float bias_val, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); +#else + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + 
[mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? din : din * scale*/ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + const operators::ActivationParam act_param, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t wbias; + float bias_val = 0.f; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + bias_val = bias[i]; + } else { + wbias = vdupq_n_f32(0.f); + } + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 4 > h_in) { + switch (j + 4 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + default: + break; + } + } + if (j + 2 > h_out) { + doutr1 = trash_buf; + } + unsigned int *vmask_ptr = vmask; + act_switch_3x3s1p0_s(dr0, + dr1, + dr2, + dr3, + out_buf1, + out_buf2, + wr0, + wr1, + wr2, + vmask_rp1, + vmask_rp2, + vzero, + wbias, + vmask_ptr, + bias_val, + act_param); + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..55ea94949ba93396c97be5e3ea66d6e29ce95429 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -0,0 +1,1043 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
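The new file below packs the input into 4-channel-interleaved tiles (via prepack_input_nxwc4_dw) and computes a 2-row-by-4-column output block per inline-assembly call, with the activation fused into the store path. As a ground-truth reference for checking those kernels — a minimal sketch, not part of the patch, assuming plain per-channel NCHW layout — a scalar depthwise 3x3 stride-1 convolution looks like this:

// Scalar reference: dout is [ch, h_out, w_out], din is [ch, h_in, w_in],
// weights is [ch, 3, 3]; zero padding of width `pad` on all sides.
static void depthwise_3x3s1_ref(float* dout, const float* din,
                                const float* weights, const float* bias,
                                int ch, int h_in, int w_in, int pad) {
  const int h_out = h_in + 2 * pad - 2;  // kernel 3, stride 1
  const int w_out = w_in + 2 * pad - 2;
  for (int c = 0; c < ch; ++c) {
    const float* w = weights + c * 9;
    const float* in = din + c * h_in * w_in;
    float* out = dout + c * h_out * w_out;
    for (int oh = 0; oh < h_out; ++oh) {
      for (int ow = 0; ow < w_out; ++ow) {
        float sum = bias ? bias[c] : 0.f;
        for (int kh = 0; kh < 3; ++kh) {
          for (int kw = 0; kw < 3; ++kw) {
            const int ih = oh - pad + kh;
            const int iw = ow - pad + kw;
            if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
              sum += w[kh * 3 + kw] * in[ih * w_in + iw];  // skip padded taps
            }
          }
        }
        out[oh * w_out + ow] = sum;
      }
    }
  }
}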
+ +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +// clang-format off +#ifdef __aarch64__ +#define COMPUTE \ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ \ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ \ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ \ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ \ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ \ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ \ + /* r0, r1, mul w0, get out r0, r1 */ \ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ \ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ \ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ \ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ \ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ \ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ \ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ \ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ \ + /* r0, r1, mul w1, get out r0, r1 */ \ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ \ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ \ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ \ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ \ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ \ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ \ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ \ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ \ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ \ + /* r0, r1, mul w2, get out r0, r1 */ \ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ \ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ \ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ \ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ \ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ \ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ \ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ \ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ \ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ \ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ \ + /* r1, r2, mul w3, get out r0, r1 */ \ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ \ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ \ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ \ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ \ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ \ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ \ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ \ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ \ + /* r1, r2, mul w4, get out r0, r1 */ \ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ \ + "ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ \ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ \ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ \ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ \ + "ldp x0, x1, [%[outl]] \n" \ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ \ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ \ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* 
outr12 = w4 * r2[3]*/ \ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ \ + /* r1, r2, mul w5, get out r0, r1 */ \ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ \ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ \ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ \ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ \ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ \ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ \ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ \ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ \ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ \ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ \ + /* r2, r3, mul w6, get out r0, r1 */ \ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ \ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ \ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ \ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ \ + "ldp x2, x3, [%[outl], #16] \n" \ + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ \ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ \ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ \ + "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ \ + /* r2, r3, mul w7, get out r0, r1 */ \ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ \ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ \ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ \ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ \ + "ldp x4, x5, [%[outl], #32] \n" \ + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ \ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ \ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ \ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ \ + /* r2, r3, mul w8, get out r0, r1 */ \ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ \ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ \ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ \ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ \ + "ldp x6, x7, [%[outl], #48] \n" \ + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ \ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ \ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ \ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ \ + \ + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ \ + /* transpose */ \ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ \ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ \ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ \ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ \ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ \ + "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ \ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ \ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ \ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ \ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ \ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ \ + "trn2 v21.2d, v1.2d, 
v3.2d\n" /* r0: d0d1d2d3*/ \ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ \ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ \ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ \ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + +#define RELU \ + "movi v0.4s, #0\n" /* for relu */ \ + "ldr x0, [%[outl], #80]\n" \ + "fmax v15.4s, v15.4s, v0.4s\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "ld1 {v1.4s}, [x0]\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "ldr x0, [%[outl]]\n" \ + +#define RELU6 \ + "fmin v15.4s, v15.4s, v1.4s\n" \ + "fmin v16.4s, v16.4s, v1.4s\n" \ + "fmin v17.4s, v17.4s, v1.4s\n" \ + "fmin v18.4s, v18.4s, v1.4s\n" \ + "fmin v19.4s, v19.4s, v1.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" + +#define LEAKY_RELU \ + "movi v0.4s, #0\n" /* for relu */ \ + "ldr x0, [%[outl], #88]\n" \ + "fcmge v1.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \ + "ld1 {v9.4s}, [x0] \n" \ + "fcmge v3.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \ + "ldr x0, [%[outl]] \n" \ + "fmul v5.4s, v15.4s, v9.4s \n" /* mul */ \ + "fmul v6.4s, v16.4s, v9.4s \n" /* mul */ \ + "fmul v7.4s, v17.4s, v9.4s \n" /* mul */ \ + "fmul v8.4s, v18.4s, v9.4s \n" /* mul */ \ + "bif v15.16b, v5.16b, v1.16b \n" /* choose*/ \ + "bif v16.16b, v6.16b, v2.16b \n" /* choose*/ \ + "bif v17.16b, v7.16b, v3.16b \n" /* choose*/ \ + "bif v18.16b, v8.16b, v4.16b \n" /* choose*/ \ + "fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fcmge v3.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v19.4s, v9.4s \n" /* mul */ \ + "fmul v6.4s, v20.4s, v9.4s \n" /* mul */ \ + "fmul v7.4s, v21.4s, v9.4s \n" /* mul */ \ + "fmul v8.4s, v22.4s, v9.4s \n" /* mul */ \ + "bif v19.16b, v5.16b, v1.16b \n" /* choose*/ \ + "bif v20.16b, v6.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v7.16b, v3.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v4.16b \n" /* choose*/ + +#define STORE \ + "cbnz %w[flag_mask], 1f\n" \ + "str q15, [x0]\n" /* save outc00 */ \ + "str q16, [x4]\n" /* save outc01 */ \ + "str q17, [x1]\n" /* save outc10 */ \ + "str q18, [x5]\n" /* save outc11 */ \ + "str q19, [x2]\n" /* save outc20 */ \ + "str q20, [x6]\n" /* save outc21 */ \ + "str q21, [x3]\n" /* save outc30 */ \ + "str q22, [x7]\n" /* save outc31 */ \ + "b 2f\n" \ + "1:\n" \ + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ \ + "2:\n" +#else +#define COMPUTE \ + /* load weights */ \ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" \ + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" \ + /* load r0, r1 */ \ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" \ + "vld1.32 {d4-d7}, [%[r0]]! 
@ load r0, q2, q3\n" \ + /* main loop */ \ + "0: @ main loop\n" \ + /* mul r0 with w0, w1, w2, get out r0 */ \ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" \ + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" \ + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" \ + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" \ + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" \ + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" \ + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" \ + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" \ + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" \ + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" \ + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" \ + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" \ + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" \ + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" \ + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" \ + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" \ + /* mul r1 with w0-w5, get out r0, r1 */ \ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" \ + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" \ + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" \ + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" \ + "vld1.32 {d10-d11}, [%[wc0]]! @ load w4 to q5\n" \ + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" \ + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" \ + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" \ + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" \ + /* mul r1 with w1, w4, get out r1, r0 */ \ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" \ + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" \ + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" \ + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" \ + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" \ + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" \ + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" \ + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" \ + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" \ + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" \ + /* mul r1 with w2, w5, get out r1, r0 */ \ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" \ + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" \ + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" \ + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" \ + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" \ + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" \ + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" \ + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" \ + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" \ + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" \ + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" \ + /* mul r2 with w3-w8, get out r0, r1 */ \ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" \ + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" \ + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" \ + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" \ + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" \ + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" \ + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" \ + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" \ + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" \ + /* mul r2 with w4, w7, get out r1, r0 */ \ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" \ + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" \ + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" \ + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" \ + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" \ + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" \ + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" \ + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" \ + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" \ + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" \ + /* mul r1 with w5, w8, get out r1, r0 */ \ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" \ + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" \ + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" \ + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" \ + "vld1.32 {d4-d7}, [%[r3]]! 
@ load r3, q2, q3\n" \ + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" \ + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" \ + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" \ + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" \ + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" \ + "vld1.32 {d0-d3}, [%[r3]]! @ load r3, q0, q1\n" \ + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" \ + /* mul r3 with w6, w7, w8, get out r1 */ \ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" \ + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" \ + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" \ + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" \ + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" \ + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" \ + "vld1.32 {d12-d13}, [r4] @ load bias\n" \ + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" \ + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" \ + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" \ + "ldr r0, [%[outl]] @ load outc00 to r0\n" \ + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" \ + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" \ + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" \ + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" \ + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" \ + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" \ + "vadd.f32 q8, q8, q6 @ r00 add bias\n" \ + "vadd.f32 q9, q9, q6 @ r01 add bias\n" \ + "vadd.f32 q10, q10, q6 @ r02 add bias\n" \ + "vadd.f32 q11, q11, q6 @ r03 add bias\n" \ + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" \ + "vadd.f32 q12, q12, q6 @ r10 add bias\n" \ + "vadd.f32 q13, q13, q6 @ r11 add bias\n" \ + "vadd.f32 q14, q14, q6 @ r12 add bias\n" \ + "vadd.f32 q15, q15, q6 @ r13 add bias\n" \ + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" \ + "vmov.u32 q7, #0 @ mov zero to q7\n" +#define RELU \ + "vmax.f32 q8, q8, q7 @ r00 relu\n" \ + "vmax.f32 q9, q9, q7 @ r01 relu\n" \ + "vmax.f32 q10, q10, q7 @ r02 relu\n" \ + "vmax.f32 q11, q11, q7 @ r03 relu\n" \ + "vmax.f32 q12, q12, q7 @ r10 relu\n" \ + "vmax.f32 q13, q13, q7 @ r11 relu\n" \ + "vmax.f32 q14, q14, q7 @ r12 relu\n" \ + "vmax.f32 q15, q15, q7 @ r13 relu\n" + +#define RELU6 \ + "ldr r4, [%[outl], #40] @ load six to r4\n" \ + "vld1.32 {d12-d13}, [r4] @load data \n" \ + "vmin.f32 q8, q8, q6 @ r00 relu\n" \ + "vmin.f32 q9, q9, q6 @ r01 relu\n" \ + "vmin.f32 q10, q10, q6 @ r02 relu\n" \ + "vmin.f32 q11, q11, q6 @ r03 relu\n" \ + "vmin.f32 q12, q12, q6 @ r10 relu\n" \ + "vmin.f32 q13, q13, q6 @ r11 relu\n" \ + "vmin.f32 q14, q14, q6 @ r12 relu\n" \ + "vmin.f32 q15, q15, q6 @ r13 relu\n" + +#define LEAKY_RELU \ + "ldr r4, [%[outl], #44] @ load scale to r4\n" \ + "vld1.32 {d12-d13}, [r4] @load data \n" \ + "vcge.f32 q0, q8, q7 @ q0 > 0 \n" \ + "vcge.f32 q1, q9, q7 @ q0 > 0 \n" \ + "vmul.f32 q4, q8, q6 \n" \ + "vmul.f32 q5, q9, q6 \n" \ + "vcge.f32 q2, q10, q7 @ q0 > 0 \n" \ + "vcge.f32 q3, q11, q7 @ q0 > 0 \n" \ + "vbif q8, q4, q0 @ choose \n" \ + "vbif q9, q5, q1 @ choose \n" \ + "vmul.f32 q4, q10, q6 \n" \ + "vmul.f32 q5, q11, q6 \n" \ + "vbif q10, q4, q2 @ choose \n" \ + "vbif q11, q5, q3 @ choose \n" \ + "vcge.f32 q0, q12, q7 @ q0 > 0 \n" \ + "vcge.f32 q1, q13, q7 @ q0 > 0 \n" \ + "vmul.f32 q4, q12, q6 \n" \ + "vmul.f32 q5, q13, q6 \n" \ + "vcge.f32 q2, q14, q7 @ q0 > 0 \n" \ + "vcge.f32 q3, q15, q7 @ q0 > 0 \n" \ + "vbif q12, q4, q0 @ choose \n" \ + "vbif q13, q5, q1 @ choose \n" \ + "vmul.f32 q4, q14, q6 \n" \ + "vmul.f32 q5, q15, q6 \n" \ + "vbif q14, q4, q2 @ choose \n" \ + "vbif q15, q5, q3 @ choose \n" + +#define STORE \ + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" \ + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" \ + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" \ + 
"vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" \ + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" \ + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" \ + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" \ + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" \ + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" \ + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" \ + "cmp %[flag_mask], #0 @ cmp flag mask\n" \ + "bne 2f\n" \ + "vst1.32 {d16-d17}, [r0] @ save outc00\n" \ + "vst1.32 {d18-d19}, [r1] @ save outc10\n" \ + "vst1.32 {d20-d21}, [r2] @ save outc20\n" \ + "vst1.32 {d22-d23}, [r3] @ save outc30\n" \ + "vst1.32 {d24-d25}, [r4] @ save outc01\n" \ + "vst1.32 {d26-d27}, [r5] @ save outc11\n" \ + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" \ + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" \ + "vst1.32 {d28-d29}, [r0] @ save outc21\n" \ + "vst1.32 {d30-d31}, [r1] @ save outc31\n" \ + "b 3f @ branch end\n" \ + "2: \n" \ + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d30-d31}, [%[out0]]! @ save remain to pre_out\n" \ + "3: \n" +#endif +// clang-format on +void act_switch_3x3s1(const float* inr0, + const float* inr1, + const float* inr2, + const float* inr3, + float* out0, + const float* weight_c, + float flag_mask, + void* outl_ptr, + float32x4_t w0, + float32x4_t w1, + float32x4_t w2, + float32x4_t w3, + float32x4_t w4, + float32x4_t w5, + float32x4_t w6, + float32x4_t w7, + float32x4_t w8, + float32x4_t vbias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] "r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); +#else + asm volatile(COMPUTE RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [out0] "+r"(out0), + [wc0] "+r"(weight_c) + : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "r0", + "r1", + "r2", + "r3", + "r4", + "r5"); +#endif + break; + case lite_api::ActivationType::kRelu6: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU RELU6 STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] 
"r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); +#else + asm volatile(COMPUTE RELU RELU6 STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [out0] "+r"(out0), + [wc0] "+r"(weight_c) + : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "r0", + "r1", + "r2", + "r3", + "r4", + "r5"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE LEAKY_RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] "r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); +#else + asm volatile(COMPUTE LEAKY_RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [out0] "+r"(out0), + [wc0] "+r"(weight_c) + : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "r0", + "r1", + "r2", + "r3", + "r4", + "r5"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] "r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); +#else + asm volatile(COMPUTE STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [out0] "+r"(out0), + [wc0] "+r"(weight_c) + : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "r0", + "r1", + "r2", + "r3", + "r4", + "r5"); +#endif + } +} +void conv_3x3s1_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + const operators::ActivationParam act_param, + ARMContext* ctx) { + int threads = ctx->threads(); + + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + + const int out_c_block = 4; + const int out_h_kernel = 2; + const int out_w_kernel = 4; + 
const int win_ext = ow + 2; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh + 2; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_bias = param.bias != nullptr; + + /// get workspace + float* ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + float six_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + float scale_ptr[4] = {1.f, 1.f, 1.f, 1.f}; + float relu_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + if (act_param.has_active) { + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + break; + case lite_api::ActivationType::kRelu6: + six_ptr[0] = act_param.Relu_clipped_coef; + six_ptr[1] = act_param.Relu_clipped_coef; + six_ptr[2] = act_param.Relu_clipped_coef; + six_ptr[3] = act_param.Relu_clipped_coef; + break; + case lite_api::ActivationType::kLeakyRelu: + scale_ptr[0] = act_param.Leaky_relu_alpha; + scale_ptr[1] = act_param.Leaky_relu_alpha; + scale_ptr[2] = act_param.Leaky_relu_alpha; + scale_ptr[3] = act_param.Leaky_relu_alpha; + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } + float32x4_t vbias = vld1q_f32(bias_local); +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc00 = dout_c00 + h * ow; + float* outc01 = outc00 + ow; + float* outc10 = outc00 + size_out_channel; + float* outc11 = outc10 + ow; + float* outc20 = outc10 + size_out_channel; + float* outc21 = outc20 + ow; + float* outc30 = outc20 + size_out_channel; + float* outc31 = outc30 + ow; + 
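The tail handling just below relies on deliberate switch fall-through: output pointers for channels or rows past the tensor edge are redirected to the shared ptr_write scratch row, so the NEON stores never need a conditional path. The same idiom in isolation (a sketch; names are illustrative):

#include <cstdio>

// k = c + 4 - oc counts how many of the 4 packed channels are out of range;
// case k redirects outc[4 - k] and falls through to kill the rest.
static void redirect_tail(float* outc[4], float* scratch, int c, int oc) {
  if (c + 4 > oc) {
    switch (c + 4 - oc) {
      case 3: outc[1] = scratch;  // fall through
      case 2: outc[2] = scratch;  // fall through
      case 1: outc[3] = scratch;
      default: break;
    }
  }
}

int main() {
  float real[4], trash[4];
  float* outc[4] = {real, real, real, real};
  redirect_tail(outc, trash, 6, 8);  // channels 6..9 requested, but oc == 8
  printf("%d pointers redirected\n", (outc[2] == trash) + (outc[3] == trash));
}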
const float* inr0 = pre_din + h * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc10 = ptr_write; + outc11 = ptr_write; + case 2: + outc20 = ptr_write; + outc21 = ptr_write; + case 1: + outc30 = ptr_write; + outc31 = ptr_write; + default: + break; + } + } + if (h + out_h_kernel > oh) { + outc01 = ptr_write; + outc11 = ptr_write; + outc21 = ptr_write; + outc31 = ptr_write; + } + + float* outl[] = {outc00, + outc10, + outc20, + outc30, + outc01, + outc11, + outc21, + outc31, + reinterpret_cast(bias_local), + reinterpret_cast(relu_ptr), + reinterpret_cast(six_ptr), + reinterpret_cast(scale_ptr)}; + void* outl_ptr = reinterpret_cast(outl); + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + float* out0 = pre_out; +#ifdef __aarch64__ + act_switch_3x3s1(inr0, + inr1, + inr2, + inr3, + out0, + weight_c, + flag_mask, + outl_ptr, + w0, + w1, + w2, + w3, + w4, + w5, + w6, + w7, + w8, + vbias, + act_param); +#else + act_switch_3x3s1(inr0, + inr1, + inr2, + inr3, + out0, + weight_c, + flag_mask, + outl_ptr, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + act_param); +#endif + outl[0] += 4; + outl[1] += 4; + outl[2] += 4; + outl[3] += 4; + outl[4] += 4; + outl[5] += 4; + outl[6] += 4; + outl[7] += 4; + inr0 += 16; + inr1 += 16; + inr2 += 16; + inr3 += 16; + if (flag_mask) { + memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); + memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); + memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); + memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); + memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); + memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); + memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); + memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc deleted file mode 100644 index 2d75323a9677f1cfbed726a1a28920dd77131688..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
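For orientation while reading the stride-2 kernel deleted below (presumably superseded by an activation-fused counterpart in the style of the s1 file above): a 3x3 stride-2 window over ow outputs spans 2*ow + 1 padded columns and, per output row, three padded input rows — which is why the deleted code packs win_ext = ow * 2 + 1 columns and hin_round = oh * 2 + 1 rows. A scalar sketch of the stride-2 indexing, not part of the patch:

static void depthwise_3x3s2_ref(float* out, const float* in, const float* w9,
                                float bias, int h_in, int w_in, int h_out,
                                int w_out, int pad) {
  for (int oh = 0; oh < h_out; ++oh) {
    for (int ow = 0; ow < w_out; ++ow) {
      float sum = bias;
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          const int ih = 2 * oh - pad + kh;  // stride-2 row index
          const int iw = 2 * ow - pad + kw;  // stride-2 column index
          if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in)
            sum += w9[kh * 3 + kw] * in[ih * w_in + iw];
        }
      }
      out[oh * w_out + ow] = sum;
    }
  }
}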
-
-#include <arm_neon.h>
-#include "lite/backends/arm/math/conv_block_utils.h"
-#include "lite/backends/arm/math/conv_impl.h"
-#include "lite/core/context.h"
-#include "lite/operators/op_params.h"
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void conv_3x3s2_depthwise_fp32(const float* i_data,
-                               float* o_data,
-                               int bs,
-                               int oc,
-                               int oh,
-                               int ow,
-                               int ic,
-                               int ih,
-                               int win,
-                               const float* weights,
-                               const float* bias,
-                               const operators::ConvParam& param,
-                               ARMContext* ctx) {
-  int threads = ctx->threads();
-  const int pad_h = param.paddings[0];
-  const int pad_w = param.paddings[1];
-  const int out_c_block = 4;
-  const int out_h_kernel = 1;
-  const int out_w_kernel = 4;
-  const int win_ext = ow * 2 + 1;
-  const int ow_round = ROUNDUP(ow, 4);
-  const int win_round = ROUNDUP(win_ext, 4);
-  const int hin_round = oh * 2 + 1;
-  const int prein_size = win_round * hin_round * out_c_block;
-  auto workspace_size =
-      threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
-  ctx->ExtendWorkspace(sizeof(float) * workspace_size);
-
-  bool flag_relu = param.fuse_relu;
-  bool flag_bias = param.bias != nullptr;
-
-  /// get workspace
-  auto ptr_zero = ctx->workspace_data<float>();
-  memset(ptr_zero, 0, sizeof(float) * win_round);
-  float* ptr_write = ptr_zero + win_round;
-
-  int size_in_channel = win * ih;
-  int size_out_channel = ow * oh;
-
-  int ws = -pad_w;
-  int we = ws + win_round;
-  int hs = -pad_h;
-  int he = hs + hin_round;
-  int w_loop = ow_round / 4;
-  auto remain = w_loop * 4 - ow;
-  bool flag_remain = remain > 0;
-  remain = 4 - remain;
-  remain = remain > 0 ? remain : 0;
-  int row_len = win_round * out_c_block;
-
-  for (int n = 0; n < bs; ++n) {
-    const float* din_batch = i_data + n * ic * size_in_channel;
-    float* dout_batch = o_data + n * oc * size_out_channel;
-#pragma omp parallel for num_threads(threads)
-    for (int c = 0; c < oc; c += out_c_block) {
-#ifdef ARM_WITH_OMP
-      float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
-#else
-      float* pre_din = ptr_write + ow_round;
-#endif
-      /// const array size
-      prepack_input_nxwc4_dw(
-          din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
-      const float* weight_c = weights + c * 9;  // kernel_w * kernel_h
-      float* dout_c00 = dout_batch + c * size_out_channel;
-      float bias_local[4] = {0, 0, 0, 0};
-      if (flag_bias) {
-        bias_local[0] = bias[c];
-        bias_local[1] = bias[c + 1];
-        bias_local[2] = bias[c + 2];
-        bias_local[3] = bias[c + 3];
-      }
-#ifdef __aarch64__
-      float32x4_t w0 = vld1q_f32(weight_c);       // w0, v23
-      float32x4_t w1 = vld1q_f32(weight_c + 4);   // w1, v24
-      float32x4_t w2 = vld1q_f32(weight_c + 8);   // w2, v25
-      float32x4_t w3 = vld1q_f32(weight_c + 12);  // w3, v26
-      float32x4_t w4 = vld1q_f32(weight_c + 16);  // w4, v27
-      float32x4_t w5 = vld1q_f32(weight_c + 20);  // w5, v28
-      float32x4_t w6 = vld1q_f32(weight_c + 24);  // w6, v29
-      float32x4_t w7 = vld1q_f32(weight_c + 28);  // w7, v30
-      float32x4_t w8 = vld1q_f32(weight_c + 32);  // w8, v31
-#endif
-      for (int h = 0; h < oh; h += out_h_kernel) {
-        float* outc0 = dout_c00 + h * ow;
-        float* outc1 = outc0 + size_out_channel;
-        float* outc2 = outc1 + size_out_channel;
-        float* outc3 = outc2 + size_out_channel;
-        const float* inr0 = pre_din + h * 2 * row_len;
-        const float* inr1 = inr0 + row_len;
-        const float* inr2 = inr1 + row_len;
-        if (c + out_c_block > oc) {
-          switch (c + out_c_block - oc) {
-            case 3:
-              outc1 = ptr_write;
-            case 2:
-              outc2 = ptr_write;
-            case 1:
-              outc3 = ptr_write;
default: - break; - } - } - auto c0 = outc0; - auto c1 = outc1; - auto c2 = outc2; - auto c3 = outc3; - float pre_out[16]; - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - if (flag_mask) { - c0 = outc0; - c1 = outc1; - c2 = outc2; - c3 = outc3; - outc0 = pre_out; - outc1 = pre_out + 4; - outc2 = pre_out + 8; - outc3 = pre_out + 12; - } -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldr q8, [%[bias]]\n" /* load bias */ - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "and v19.16b, v8.16b, v8.16b\n" - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "and v20.16b, v8.16b, v8.16b\n" - "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ - "and v21.16b, v8.16b, v8.16b\n" - "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ - "and v22.16b, v8.16b, v8.16b\n" - "ldr q8, [%[inr0]]\n" /* load input r0*/ - /* r0 mul w0-w2, get out */ - "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ - "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ - "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ - "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ - "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ - "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ - "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ - "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ - "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ - "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ - "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ - "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ - "ldr q8, [%[inr1]]\n" /* load input r1*/ - /* r1, mul w3-w5, get out */ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ - "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ - "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ - "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ - "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ - "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ - "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ - "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ - "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ - "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ - "ldr q8, [%[inr2]]\n" /* load input r2*/ - /* r2, mul w6-w8, get out r0, r1 */ - "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ - "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ - "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ - "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ - "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ - "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ - "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ - "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ - "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ - "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ - "fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = 
w8 * r2, 6*/ - "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ - /* transpose */ - "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ - "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - /* relu */ - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - /* save result */ - "0:\n" - "str q19, [%[outc0]], #16\n" - "str q20, [%[outc1]], #16\n" - "str q21, [%[outc2]], #16\n" - "str q22, [%[outc3]], #16\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v19","v20","v21","v22" - ); -#else - asm volatile( - /* fill with bias */ - "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ - /* load weights */ - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ - "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ - "vand.i32 q12, q8, q8\n" - "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ - "vand.i32 q13, q8, q8\n" - "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ - "vand.i32 q14, q8, q8\n" - "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ - "vand.i32 q15, q8, q8\n" - "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ - /* mul r0 with w0, w1, w2 */ - "vmla.f32 q12, q9, q0 @ w0 * inr0\n" - "vmla.f32 q13, q9, q2 @ w0 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ - "vmla.f32 q14, q9, q4 @ w0 * inr4\n" - "vmla.f32 q15, q9, q6 @ w0 * inr6\n" - "vmla.f32 q12, q10, q1 @ w1 * inr1\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w1 * inr3\n" - "vmla.f32 q14, q10, q5 @ w1 * inr5\n" - "vmla.f32 q15, q10, q7 @ w1 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w2 * inr2\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w2 * inr4\n" - "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w2 * inr6\n" - "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w2 * inr8\n" - /* mul r1 with w3, w4, w5 */ - "vmla.f32 q12, q9, q0 @ w3 * inr0\n" - "vmla.f32 q13, q9, q2 @ w3 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ - "vmla.f32 q14, q9, q4 @ w3 * inr4\n" - "vmla.f32 q15, q9, q6 @ w3 * inr6\n" - "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ - "vmla.f32 q12, q10, q1 @ w4 * inr1\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w4 * inr3\n" - "vmla.f32 q14, q10, q5 @ w4 * inr5\n" - "vmla.f32 q15, q10, q7 @ w4 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w5 * inr2\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w5 * inr4\n" - "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w5 * inr6\n" - "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w5 * inr8\n" - /* mul r2 with w6, w7, w8 */ - "vmla.f32 q12, q9, q0 @ w6 * inr0\n" - "vmla.f32 q13, q9, q2 @ w6 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ - "vmla.f32 q14, q9, q4 @ w6 * inr4\n" - "vmla.f32 q15, q9, q6 @ w6 * inr6\n" - "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ - "vmla.f32 q12, q10, q1 @ w7 * inr1\n" - "vmla.f32 q13, q10, q3 @ w7 * inr3\n" - "vmla.f32 q14, q10, q5 @ w7 * inr5\n" - "vmla.f32 q15, q10, q7 @ w7 * inr7\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - "vmla.f32 q12, q11, q2 @ w8 * inr2\n" - "vmla.f32 q13, q11, q4 @ w8 * inr4\n" - "vmla.f32 q14, q11, q6 @ w8 * inr6\n" - "vmla.f32 q15, q11, q8 @ w8 * inr8\n" - /* transpose */ - "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ - "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ - "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ - "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ - "cmp %[flag_relu], #0\n" - "beq 0f\n" /* skip relu*/ - "vmov.u32 q0, #0\n" - "vmax.f32 q12, q12, q0\n" - "vmax.f32 q13, q13, q0\n" - "vmax.f32 q14, q14, q0\n" - "vmax.f32 q15, q15, q0\n" - "0:\n" - "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ - "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ - "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ - "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ - :[r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [wc0] "+r" (weight_c), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[bias] "r" (bias_local), - [flag_relu]"r"(flag_relu) - :"cc", "memory", - "q0","q1","q2","q3","q4","q5","q6","q7", - "q8", "q9","q10","q11","q12","q13","q14","q15" - ); -#endif // __arch64__ - // clang-format off - if (flag_mask) { - for (int i = 0; i < remain; ++i) { - c0[i] = pre_out[i]; - c1[i] = pre_out[i + 4]; - c2[i] = pre_out[i + 8]; - c3[i] = pre_out[i + 12]; - } - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 8260718a50f8e2fa8497d41d958e82a45ea0480d..f5b196efcca3f3f35367f2fea5e8f475b7147f48 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ARMContext* ctx) { auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); + auto paddings = *param.paddings; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -73,10 +74,12 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! 3x3s2 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! 
write output to tmp buffer + auto paddings = *param.paddings; + auto act_param = param.activation_param; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round * 2 /*stride_w*/ + 1; bool flag_relu = param.fuse_relu; @@ -508,7 +511,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } #pragma omp parallel for num_threads(threads) @@ -837,7 +841,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } } } diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc index 01b7a812ebc05a054bb9952bf53605ce7aed135a..3d6f3dd743c3e46b6123f2c93dbfed586ad7b4c6 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc @@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; @@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); //! set 1/4 l2 cache int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv_depthwise_3x3s2.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc similarity index 60% rename from lite/backends/arm/math/conv_depthwise_3x3s2.cc rename to lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc index ec039af98cb7e4fb037475dd4e5ee29204252165..3e5569365119b97397c6d42f48bacd2552b248e5 100644 --- a/lite/backends/arm/math/conv_depthwise_3x3s2.cc +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
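A recurring change in the hunks above is the padding access: `param.paddings` is now dereferenced through a `shared_ptr` to a four-element vector, and the width pad moves from index 1 to index 2. My reading of the new convention, shown as a small sketch (the {top, bottom, left, right} layout is inferred from the indices used in this patch, not from a spec):

```cpp
#include <memory>
#include <vector>

int main() {
  // paddings = {top, bottom, left, right}; layout inferred from this patch.
  auto paddings_ptr =
      std::make_shared<std::vector<int>>(std::vector<int>{1, 1, 2, 2});
  auto paddings = *paddings_ptr;  // same dereference as in the hunks above
  const int pad_h = paddings[0];  // top pad, unchanged at index 0
  const int pad_w = paddings[2];  // left pad, previously paddings[1]
  return (pad_h == 1 && pad_w == 2) ? 0 : 1;
}
```

Dereferencing once into a local (`auto paddings = *param.paddings;`) also avoids repeating the pointer indirection inside the hot loops.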
-#include "lite/backends/arm/math/conv_depthwise.h" #include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_depthwise.h" namespace paddle { namespace lite { @@ -24,13 +25,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2p0_bias_s(float* dout, @@ -38,13 +39,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2p1_bias(float* dout, @@ -52,13 +53,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2p1_bias_s(float* dout, @@ -66,13 +67,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2_fp32(const float* din, @@ -88,7 +89,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, const float* bias, int pad, bool flag_bias, - bool flag_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { if (pad == 0) { if (w_in > 7) { @@ -97,13 +98,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, weights, bias, flag_bias, - flag_relu, num, ch_in, h_in, w_in, h_out, w_out, + act_param, ctx); } else { conv_depthwise_3x3s2p0_bias_s(dout, @@ -111,13 +112,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, weights, bias, flag_bias, - flag_relu, num, ch_in, h_in, w_in, h_out, w_out, + act_param, ctx); } } @@ -128,13 +129,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, weights, bias, flag_bias, - flag_relu, num, ch_in, h_in, w_in, h_out, w_out, + act_param, ctx); } else { conv_depthwise_3x3s2p1_bias_s(dout, @@ -142,13 +143,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, weights, bias, flag_bias, - flag_relu, num, ch_in, h_in, w_in, h_out, w_out, + act_param, ctx); } } @@ -205,14 +206,12 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" + "fadd v16.4s, v16.4s, v12.4s \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" #define LEFT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ \ "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ @@ -244,53 +243,52 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "blt 1f \n" -#define MID_COMPUTE_S2 \ - "2: \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, v18.16b, #4 \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ 
\ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, v19.16b, #4 \n" \ - \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, v20.16b, #4 \n" \ - \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, v21.16b, #4 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" #define MID_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -360,14 +358,12 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "fadd v16.4s, v16.4s, v11.4s \n" \ "fadd v16.4s, v16.4s, v12.4s \n" \ - "ld1 {v1.4s}, [%[outptr1]] \n" + "ld1 {v1.4s}, [%[outptr1]] \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" #define RIGHT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ "bif v16.16b, v0.16b, %[wmask].16b \n" \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -382,11 +378,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "4: \n" #define LEFT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ "fmax v16.4s, v16.4s, %[vzero].4s \n" \ \ "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ @@ -422,16 +413,85 
@@ void conv_depthwise_3x3s2_fp32(const float* din, "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ \ "blt 1f \n" +#define LEFT_RESULT_S2_RELU6 \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + "ld1 {v22.4s}, [%[six_ptr]] \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + "fmin v16.4s, v16.4s, v22.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fmin v17.4s, v17.4s, v22.4s \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define LEFT_RESULT_S2_LEAKY_RELU \ + "ld1 {v22.4s}, [%[scale_ptr]] \n" \ + "fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fmul v12.4s, v16.4s, v22.4s \n" \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v12.4s, v16.4s, v22.4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" #define MID_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -456,12 +516,59 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "bne 2b \n" -#define RIGHT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ +#define MID_RESULT_S2_RELU6 \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ \ + "fmin v16.4s, v16.4s, v22.4s \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "fmin v17.4s, v17.4s, v22.4s \n" \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, 
%[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define MID_RESULT_S2_LEAKY_RELU \ + "fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v12.4s, v16.4s, v22.4s \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v12.4s, v17.4s, v22.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -479,6 +586,47 @@ void conv_depthwise_3x3s2_fp32(const float* din, "st1 {v17.4s}, [%[outptr1]], #16 \n" \ "4: \n" +#define RIGHT_RESULT_S2_RELU6 \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "fmin v16.4s, v16.4s, v22.4s \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + "fmin v17.4s, v17.4s, v22.4s \n" \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define RIGHT_RESULT_S2_LEAKY_RELU \ + "fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v12.4s, v16.4s, v22.4s \n" \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v12.4s, v17.4s, v22.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + "bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + #define COMPUTE_S_S2 \ "movi v9.4s, #0 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ @@ -523,7 +671,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "fmax v4.4s, v4.4s, v9.4s \n" \ \ "st1 {v4.4s}, [%[out]] \n" - #define COMPUTE_S_S2_P0 \ "movi v9.4s, #0 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ @@ -560,7 +707,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "fadd v4.4s, v4.4s, v16.4s \n" #define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" - #define RESULT_S_S2_P0_RELU \ "fmax v4.4s, v4.4s, v9.4s \n" \ "st1 {v4.4s}, [%[out]] \n" @@ -705,7 +851,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "vst1.32 {d6-d7}, [%[outptr]]! 
\n" \ "cmp %[cnt], #1 \n" \ "blt 1f \n" - #define MID_RESULT_S2_RELU \ "vmax.f32 q3, q3, q9 @ relu \n" \ "subs %[cnt], #1 \n" \ @@ -762,7 +907,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "vadd.f32 q3, q3, q5 @ add \n" #define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" - #define RESULT_S_S2_RELU \ "vmax.f32 q3, q3, q9 @ relu\n" \ \ @@ -810,13 +954,233 @@ void conv_depthwise_3x3s2_fp32(const float* din, "vadd.f32 q3, q3, q5 @ add \n" #define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" - #define RESULT_S_S2_P0_RELU \ "vmax.f32 q3, q3, q9 @ relu \n" \ "vst1.32 {d6-d7}, [%[out]] \n" - #endif - +#ifdef __aarch64__ +void act_switch_3x3s2p1(const float* din0_ptr, + const float* din1_ptr, + const float* din2_ptr, + const float* din3_ptr, + const float* din4_ptr, + float* doutr0_ptr, + float* doutr1_ptr, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp1, + uint32x4_t vmask_rp2, + uint32x4_t wmask, + float32x4_t wbias, + float32x4_t vzero, + int cnt, + int cnt_remain, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU + MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } +} +#endif /** * \brief depthwise convolution kernel 3x3, stride 2 * w_in > 7 @@ -826,27 +1190,29 @@ void conv_depthwise_3x3s2p1_bias(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx) { int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int out_pad_idx[4] = {0, 1, 2, 3}; int size_pad_bottom = h_out * 2 - h_in; - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + unsigned int size_right_remain = (unsigned int)(7 + (tile_w << 3) - w_in); + size_right_remain = 8 - size_right_remain; - int size_right_pad = w_out * 2 - w_in; + if (cnt_remain == 0 && size_right_remain == 0) { + cnt_remain = 4; + tile_w -= 1; + size_right_remain = 8; + } + int cnt_col = tile_w - 1; uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx)); // 0 2 4 6 @@ -912,7 +1278,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout, float* doutr1_ptr = nullptr; #ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { + for (int i = 0; i < h_out; i += 2) { din0_ptr = dr0; din1_ptr = dr1; din2_ptr = dr2; @@ -939,8 +1305,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout, dr4 = dr3 + w_in; //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { + if (i * 2 + 4 > h_in) { + switch (i * 2 + 4 - h_in) { case 4: din1_ptr = zero_ptr; case 3: @@ -954,104 +1320,32 @@ void conv_depthwise_3x3s2p1_bias(float* dout, } } //! 
process output pad - if (i / 2 + 2 > h_out) { + if (i + 2 > h_out) { doutr1_ptr = write_ptr; } int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } + act_switch_3x3s2p1(din0_ptr, + din1_ptr, + din2_ptr, + din3_ptr, + din4_ptr, + doutr0_ptr, + doutr1_ptr, + wr0, + wr1, + wr2, + vmask_rp1, + vmask_rp2, + wmask, + wbias, + vzero, + cnt, + cnt_remain, + act_param); doutr0 = doutr0 + 2 * w_out; } #else - for (int i = 0; i < h_in; i += 2) { + for (int i = 0; i < h_out; i++) { din0_ptr = dr0; din1_ptr = dr1; din2_ptr = dr2; @@ -1072,8 +1366,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout, } //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { + if (i * 2 + 2 > h_in) { + switch (i * 2 + 2 - h_in) { case 2: din1_ptr = zero_ptr; case 1: @@ -1084,65 +1378,37 @@ void conv_depthwise_3x3s2p1_bias(float* dout, } int cnt = cnt_col; unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + // do act + if (act_param.has_active) { + act_switch_process(doutr0, doutr0, w_out, &act_param); } doutr0 = doutr0 + w_out; } @@ -1159,13 +1425,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx) { int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int out_pad_idx[4] = {0, 1, 2, 3}; @@ -1221,108 +1487,59 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, unsigned int* mask_ptr = dmask; #ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) 
+ : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); #else - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); #endif + // do act + if (act_param.has_active) { + act_switch_process(out_buf, out_buf, w_out, &act_param); + } for (int w = 0; w < w_out; ++w) { *dout_channel++ = out_buf[w]; } @@ -1333,6 +1550,271 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, } } +#ifdef __aarch64__ +void act_switch_3x3s2p0(const float* din0_ptr, + const float* din1_ptr, + const float* din2_ptr, + const float* din3_ptr, + const float* din4_ptr, + float* doutr0_ptr, + float* doutr1_ptr, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp1, + uint32x4_t vmask_rp2, + uint32x4_t wmask, + float32x4_t wbias, + float32x4_t vzero, + int cnt, + int cnt_remain, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] 
\n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU6 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } +} +#endif /** * \brief depthwise convolution kernel 3x3, stride 2 */ @@ -1342,13 +1824,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const 
operators::ActivationParam act_param, ARMContext* ctx) { int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int out_pad_idx[4] = {0, 1, 2, 3}; @@ -1356,7 +1838,14 @@ void conv_depthwise_3x3s2p0_bias(float* dout, int tile_w = w_out >> 2; int cnt_remain = w_out % 4; - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); + unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in); + size_right_remain = 8 - size_right_remain; + + if (cnt_remain == 0 && size_right_remain == 0) { + cnt_remain = 4; + tile_w -= 1; + size_right_remain = 8; + } uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx)); // 0 2 4 6 @@ -1461,117 +1950,24 @@ void conv_depthwise_3x3s2p0_bias(float* dout, doutr1_ptr = write_ptr; } int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } + act_switch_3x3s2p0(din0_ptr, + din1_ptr, + din2_ptr, + din3_ptr, + din4_ptr, + doutr0_ptr, + doutr1_ptr, + wr0, + wr1, + wr2, + vmask_rp1, + vmask_rp2, + wmask, + wbias, + vzero, + cnt, + cnt_remain, + act_param); doutr0 = doutr0 + 2 * w_out; } #else @@ -1599,64 +1995,36 @@ void conv_depthwise_3x3s2p0_bias(float* dout, } int cnt = tile_w; unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), 
- [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + if (act_param.has_active) { + act_switch_process(doutr0, doutr0, w_out, &act_param); } doutr0 = doutr0 + w_out; } @@ -1673,13 +2041,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, const float* weights, const float* bias, bool flag_bias, - bool flag_relu, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext* ctx) { int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int out_pad_idx[4] = {0, 1, 2, 3}; @@ -1741,114 +2109,62 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, unsigned int* mask_ptr = dmask; #ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + #else - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - 
[din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); #endif + if (act_param.has_active) { + act_switch_process(out_buf, out_buf, w_out, &act_param); + } for (int w = 0; w < w_out; ++w) { *dout_channel++ = out_buf[w]; } diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..4617d40f4372f6589f20b50205fb307cdc705808 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -0,0 +1,721 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
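On the armv7 paths in the hunks above, the fused-activation variants of the asm were dropped: the plain kernel runs first and `act_switch_process` (from conv_block_utils.h) then applies the activation over the finished row. The stand-in below only illustrates the dispatch shape of such a pass under assumed parameter names; it is not the library implementation:

```cpp
#include <algorithm>

// Hypothetical mirror of the activation kinds used in this patch.
enum class Act { kRelu, kRelu6, kLeakyRelu };

struct ActParam {
  bool has_active{false};
  Act active_type{Act::kRelu};
  float six{6.f};     // stands in for Relu_clipped_coef
  float alpha{0.1f};  // stands in for Leaky_relu_alpha
};

// Apply the fused activation to one finished output row of n floats.
void act_process_row(float* dout, const float* din, int n, const ActParam& p) {
  if (!p.has_active) return;
  for (int i = 0; i < n; ++i) {
    float v = din[i];
    switch (p.active_type) {
      case Act::kRelu:      v = std::max(v, 0.f); break;
      case Act::kRelu6:     v = std::min(std::max(v, 0.f), p.six); break;
      case Act::kLeakyRelu: v = (v >= 0.f) ? v : v * p.alpha; break;
    }
    dout[i] = v;
  }
}
```

This keeps the armv7 asm small at the cost of a second pass over the row; the aarch64 `act_switch_3x3s2p*` helpers instead select an asm variant with the activation fused in.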
+
+#include <arm_neon.h>
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/core/context.h"
+#include "lite/operators/op_params.h"
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+#ifdef __aarch64__
+#define COMPUTE                                                   \
+  "ldr q8, [%[bias]]\n"            /* load bias */               \
+  "ldp q0, q1, [%[inr0]], #32\n"   /* load input r0*/            \
+  "and v19.16b, v8.16b, v8.16b\n"                                 \
+  "ldp q2, q3, [%[inr0]], #32\n"   /* load input r0*/            \
+  "and v20.16b, v8.16b, v8.16b\n"                                 \
+  "ldp q4, q5, [%[inr0]], #32\n"   /* load input r0*/            \
+  "and v21.16b, v8.16b, v8.16b\n"                                 \
+  "ldp q6, q7, [%[inr0]], #32\n"   /* load input r0*/            \
+  "and v22.16b, v8.16b, v8.16b\n"                                 \
+  "ldr q8, [%[inr0]]\n"            /* load input r0*/            \
+  "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/      \
+  "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/      \
+  "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/      \
+  "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/      \
+  "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/      \
+  "ldp q0, q1, [%[inr1]], #32\n"   /* load input r1*/            \
+  "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/      \
+  "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/      \
+  "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/      \
+  "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w2 * r0, 2*/      \
+  "ldp q2, q3, [%[inr1]], #32\n"   /* load input r1*/            \
+  "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w2 * r0, 4*/      \
+  "ldp q4, q5, [%[inr1]], #32\n"   /* load input r1*/            \
+  "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w2 * r0, 6*/      \
+  "ldp q6, q7, [%[inr1]], #32\n"   /* load input r1*/            \
+  "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w2 * r0, 8*/      \
+  "ldr q8, [%[inr1]]\n"            /* load input r1*/            \
+  "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/      \
+  "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/      \
+  "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/      \
+  "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/      \
+  "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/      \
+  "ldp q0, q1, [%[inr2]], #32\n"   /* load input r2*/            \
+  "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/      \
+  "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/      \
+  "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/      \
+  "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/      \
+  "ldp q2, q3, [%[inr2]], #32\n"   /* load input r2*/            \
+  "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/      \
+  "ldp q4, q5, [%[inr2]], #32\n"   /* load input r2*/            \
+  "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/      \
+  "ldp q6, q7, [%[inr2]], #32\n"   /* load input r2*/            \
+  "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/      \
+  "ldr q8, [%[inr2]]\n"            /* load input r2*/            \
+  "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/      \
+  "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/      \
+  "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/      \
+  "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/      \
+  "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/      \
+  "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/      \
+  "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/      \
+  "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/      \
+  "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/      \
+  "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/      \
+  "fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/      \
+  "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/      \
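+  /* v19..v22 each hold one output pixel (4 channels); the trn1/trn2 */ \
+  /* pairs below transpose that 4x4 block so each register ends up   */ \
+  /* holding one channel across 4 pixels, matching STORE's           */ \
+  /* per-channel row writes to outc0..outc3.                         */ \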
+  "trn1 v0.4s, v19.4s, v20.4s\n"   /* r0: a0a1c0c1*/             \
+  "trn2 v1.4s, v19.4s, v20.4s\n"   /* r0: b0b1d0d1*/             \
+  "trn1 v2.4s, v21.4s, v22.4s\n"   /* r0: a2a3c2c3*/             \
+  "trn2 v3.4s, v21.4s, v22.4s\n"   /* r0: b2b3d2d3*/             \
+  "trn1 v19.2d, v0.2d, v2.2d\n"    /* r0: a0a1a2a3*/             \
+  "trn2 v21.2d, v0.2d, v2.2d\n"    /* r0: c0c1c2c3*/             \
+  "trn1 v20.2d, v1.2d, v3.2d\n"    /* r0: b0b1b2b3*/             \
+  "trn2 v22.2d, v1.2d, v3.2d\n"    /* r0: d0d1d2d3*/
+#define RELU                       /* relu */                     \
+  "movi v0.4s, #0\n"               /* for relu */                 \
+  "fmax v19.4s, v19.4s, v0.4s\n"                                  \
+  "fmax v20.4s, v20.4s, v0.4s\n"                                  \
+  "fmax v21.4s, v21.4s, v0.4s\n"                                  \
+  "fmax v22.4s, v22.4s, v0.4s\n"
+#define RELU6                      /* relu6 */                    \
+  "fmin v19.4s, v19.4s, %[vsix].4s\n"                             \
+  "fmin v20.4s, v20.4s, %[vsix].4s\n"                             \
+  "fmin v21.4s, v21.4s, %[vsix].4s\n"                             \
+  "fmin v22.4s, v22.4s, %[vsix].4s\n"
+#define LEAKY_RELU                 /* LeakyRelu */                \
+  "movi v0.4s, #0\n"               /* for relu */                 \
+  "fcmge v1.4s, v19.4s, v0.4s \n"  /* vcgeq_u32 */                \
+  "fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */                 \
+  "fcmge v3.4s, v20.4s, v0.4s \n"  /* vcgeq_u32 */                \
+  "fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */                 \
+  "fcmge v5.4s, v21.4s, v0.4s \n"  /* vcgeq_u32 */                \
+  "fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */                 \
+  "fcmge v7.4s, v22.4s, v0.4s \n"  /* vcgeq_u32 */                \
+  "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */                 \
+  "bif v19.16b, v2.16b, v1.16b \n" /* choose*/                    \
+  "bif v20.16b, v4.16b, v3.16b \n" /* choose*/                    \
+  "bif v21.16b, v6.16b, v5.16b \n" /* choose*/                    \
+  "bif v22.16b, v8.16b, v7.16b \n" /* choose*/
+#define STORE                      /* save result */              \
+  "str q19, [%[outc0]], #16\n"                                    \
+  "str q20, [%[outc1]], #16\n"                                    \
+  "str q21, [%[outc2]], #16\n"                                    \
+  "str q22, [%[outc3]], #16\n"
+
+#else
+#define COMPUTE                                                   \
+  /* fill with bias */                                            \
+  "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ /* load weights */ \
+  "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */      \
+  "vld1.32 {d0-d3}, [%[r0]]!\n"    /* load input r0, 0,1*/        \
+  "vand.i32 q12, q8, q8\n"                                        \
+  "vld1.32 {d4-d7}, [%[r0]]!\n"    /* load input r0, 2,3*/        \
+  "vand.i32 q13, q8, q8\n"                                        \
+  "vld1.32 {d8-d11}, [%[r0]]!\n"   /* load input r0, 4,5*/        \
+  "vand.i32 q14, q8, q8\n"                                        \
+  "vld1.32 {d12-d15}, [%[r0]]!\n"  /* load input r0, 6,7*/        \
+  "vand.i32 q15, q8, q8\n"                                        \
+  "vld1.32 {d16-d17}, [%[r0]]\n"   /* load input r0, 8*/          \
+  "vmla.f32 q12, q9, q0 @ w0 * inr0\n"                            \
+  "vmla.f32 q13, q9, q2 @ w0 * inr2\n"                            \
+  "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */          \
+  "vmla.f32 q14, q9, q4 @ w0 * inr4\n"                            \
+  "vmla.f32 q15, q9, q6 @ w0 * inr6\n"                            \
+  "vmla.f32 q12, q10, q1 @ w1 * inr1\n"                           \
+  "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"                   \
+  "vmla.f32 q13, q10, q3 @ w1 * inr3\n"                           \
+  "vmla.f32 q14, q10, q5 @ w1 * inr5\n"                           \
+  "vmla.f32 q15, q10, q7 @ w1 * inr7\n"                           \
+  "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */      \
+  "vmla.f32 q12, q11, q2 @ w2 * inr2\n"                           \
+  "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"                   \
+  "vmla.f32 q13, q11, q4 @ w2 * inr4\n"                           \
+  "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"                  \
+  "vmla.f32 q14, q11, q6 @ w2 * inr6\n"                           \
+  "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"                 \
+  "vmla.f32 q15, q11, q8 @ w2 * inr8\n" /* mul r1 with w3, w4*/   \
+  "vmla.f32 q12, q9, q0 @ w3 * inr0\n"                            \
+  "vmla.f32 q13, q9, q2 @ w3 * inr2\n"                            \
+  "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */          \
+  "vmla.f32 q14, q9, q4 @ w3 * inr4\n"                            \
+  "vmla.f32 q15, q9, q6 @ w3 * inr6\n"                            \
+  "vld1.32 {d16-d17}, [%[r1]]\n"   /* load input r1, 8*/          \
+  "vmla.f32 q12, q10, q1 @ w4 * inr1\n"                           \
+  "vld1.32 {d0-d3}, [%[r2]]!
@ load r2, 0, 1\n" \ + "vmla.f32 q13, q10, q3 @ w4 * inr3\n" \ + "vmla.f32 q14, q10, q5 @ w4 * inr5\n" \ + "vmla.f32 q15, q10, q7 @ w4 * inr7\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ \ + "vmla.f32 q12, q11, q2 @ w5 * inr2\n" \ + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" \ + "vmla.f32 q13, q11, q4 @ w5 * inr4\n" \ + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" \ + "vmla.f32 q14, q11, q6 @ w5 * inr6\n" \ + "vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n" \ + "vmla.f32 q15, q11, q8 @ w5 * inr8\n" /* mul r2 with w6, w7*/ \ + "vmla.f32 q12, q9, q0 @ w6 * inr0\n" \ + "vmla.f32 q13, q9, q2 @ w6 * inr2\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ \ + "vmla.f32 q14, q9, q4 @ w6 * inr4\n" \ + "vmla.f32 q15, q9, q6 @ w6 * inr6\n" \ + "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ \ + "vmla.f32 q12, q10, q1 @ w7 * inr1\n" \ + "vmla.f32 q13, q10, q3 @ w7 * inr3\n" \ + "vmla.f32 q14, q10, q5 @ w7 * inr5\n" \ + "vmla.f32 q15, q10, q7 @ w7 * inr7\n" \ + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" \ + "vmla.f32 q12, q11, q2 @ w8 * inr2\n" \ + "vmla.f32 q13, q11, q4 @ w8 * inr4\n" \ + "vmla.f32 q14, q11, q6 @ w8 * inr6\n" \ + "vmla.f32 q15, q11, q8 @ w8 * inr8\n" /* transpose */ \ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ \ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ \ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ \ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ +#define RELU /* relu */ \ + "vmov.u32 q0, #0\n" \ + "vld1.32 {d2-d3}, [%[six_ptr]]\n" \ + "vmax.f32 q12, q12, q0\n" \ + "vmax.f32 q13, q13, q0\n" \ + "vmax.f32 q14, q14, q0\n" \ + "vmax.f32 q15, q15, q0\n" +#define RELU6 /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" \ + "vmin.f32 q13, q13, q1\n" \ + "vmin.f32 q14, q14, q1\n" \ + "vmin.f32 q15, q15, q1\n" +#define LEAKY_RELU /* LeakyRelu */ \ + "vmov.u32 q0, #0\n" \ + "vld1.32 {d2-d3}, [%[scale_ptr]]\n" \ + "vcge.f32 q2, q12, q0 @ q0 > 0 \n" \ + "vcge.f32 q4, q13, q0 @ q0 > 0 \n" \ + "vcge.f32 q6, q14, q0 @ q0 > 0 \n" \ + "vcge.f32 q8, q15, q0 @ q0 > 0 \n" \ + "vmul.f32 q3, q12, q1 @ mul \n" \ + "vmul.f32 q5, q13, q1 @ mul \n" \ + "vmul.f32 q7, q14, q1 @ mul \n" \ + "vmul.f32 q9, q15, q1 @ mul \n" \ + "vbif q12, q3, q2 @ choose \n" \ + "vbif q13, q5, q4 @ choose \n" \ + "vbif q14, q7, q6 @ choose \n" \ + "vbif q15, q9, q8 @ choose \n" +#define STORE /* save result */ \ + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ \ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ \ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ \ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + +#endif + +void act_switch_3x3s2(const float* inr0, + const float* inr1, + const float* inr2, + float* outc0, + float* outc1, + float* outc2, + float* outc3, + const float* weight_c, + float* bias_local, + float32x4_t w0, + float32x4_t w1, + float32x4_t w2, + float32x4_t w3, + float32x4_t w4, + float32x4_t w5, + float32x4_t w6, + float32x4_t w7, + float32x4_t w8, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(tmp); + float32x4_t vscale = vdupq_n_f32(ss); +#else + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0] "+r"(outc0), 
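+                     /* Operand-constraint cheat sheet for the asm blocks
+                        in this file: "+r" is an in/out general-purpose
+                        register (the row and output pointers are advanced
+                        by the post-indexed loads/stores inside the asm);
+                        "w" is a NEON register operand (the preloaded
+                        weight vectors w0..w8, and vsix/vscale for
+                        relu6/leaky-relu, stay resident in v-registers);
+                        the clobber list names every v/q register the body
+                        touches plus "cc"/"memory" so the compiler neither
+                        caches values across the asm nor reuses those
+                        registers. */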
+ [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [bias] "r"(bias_local) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [six_ptr] "r"(vsix) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU RELU6 STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [bias] "r"(bias_local), + [vsix] "w"(vsix) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE RELU RELU6 STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [six_ptr] "r"(vsix) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE LEAKY_RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [bias] "r"(bias_local), + [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE LEAKY_RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [scale_ptr] "r"(vscale) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [bias] "r"(bias_local) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE STORE + : [r0] "+r"(inr0), + 
[r1] "+r"(inr1), + [r2] "+r"(inr2), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } +} + +void conv_3x3s2_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + const operators::ActivationParam act_param, + ARMContext* ctx) { + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 1; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 1; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = threads * prein_size + win_round + ow_round; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + 
+            case 3:
+              outc1 = ptr_write;
+            case 2:
+              outc2 = ptr_write;
+            case 1:
+              outc3 = ptr_write;
+            default:
+              break;
+          }
+        }
+        auto c0 = outc0;
+        auto c1 = outc1;
+        auto c2 = outc2;
+        auto c3 = outc3;
+        float pre_out[16];
+        for (int w = 0; w < w_loop; ++w) {
+          bool flag_mask = (w == w_loop - 1) && flag_remain;
+          if (flag_mask) {
+            c0 = outc0;
+            c1 = outc1;
+            c2 = outc2;
+            c3 = outc3;
+            outc0 = pre_out;
+            outc1 = pre_out + 4;
+            outc2 = pre_out + 8;
+            outc3 = pre_out + 12;
+          }
+#ifdef __aarch64__
+          act_switch_3x3s2(inr0,
+                           inr1,
+                           inr2,
+                           outc0,
+                           outc1,
+                           outc2,
+                           outc3,
+                           weight_c,
+                           bias_local,
+                           w0,
+                           w1,
+                           w2,
+                           w3,
+                           w4,
+                           w5,
+                           w6,
+                           w7,
+                           w8,
+                           act_param);
+#else
+          act_switch_3x3s2(inr0,
+                           inr1,
+                           inr2,
+                           outc0,
+                           outc1,
+                           outc2,
+                           outc3,
+                           weight_c,
+                           bias_local,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           vzero,
+                           act_param);
+#endif
+          if (flag_mask) {
+            for (int i = 0; i < remain; ++i) {
+              c0[i] = pre_out[i];
+              c1[i] = pre_out[i + 4];
+              c2[i] = pre_out[i + 8];
+              c3[i] = pre_out[i + 12];
+            }
+          }
+          inr0 += 32;
+          inr1 += 32;
+          inr2 += 32;
+          outc0 += 4;
+          outc1 += 4;
+          outc2 += 4;
+          outc3 += 4;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc b/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc
index 1a2e42e0a9ca4193be84a21247112de8cdc144a1..6125547b8ba611d016d5d85359a4138b0ede7607 100644
--- a/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc
@@ -13,9602 +13,750 @@
 // limitations under the License.
 
 #include <arm_neon.h>
+#include "lite/backends/arm/math/conv_block_utils.h"
 #include "lite/backends/arm/math/conv_depthwise.h"
+#include "lite/core/context.h"
+#include "lite/operators/op_params.h"
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
 
 namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {
-//! weights layout
-//! *-----------------------*-----*
-//! w0 <-- | W0  W1  W2  W3  | W4  |
-//! *-----------------------*     |
-//! w1 <-- | W5  W6  W7  W8  | W9  |
-//! *-----------------------*     | --> w5
-//! w2 <-- | W10 W11 W12 W13 | W14 |
-//! *-----------------------*     |
-//! w3 <-- | W15 W16 W17 W18 | W19 |
-//! *-----------------------*-----*
-//! w4 <-- | W20 W21 W22 W23 | W24 | --> w6[0]
-//!
*-----------------------*-----* - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s1_small_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s1_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s1_small_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -static float* prepad_input( - const float* input, int num, int ch_in, int h_in, int w_in, int pad) { - int h_new = h_in + 2 * pad; - int w_new = w_in + 2 * pad; - float* new_input = - static_cast(malloc(h_new * w_new * ch_in * num * sizeof(float))); - float* new_input_ptr = new_input; - for (int c = 0; c < num * ch_in; ++c) { - memset(new_input_ptr, 0x00, w_new * pad * sizeof(float)); - new_input_ptr += w_new * pad; - for (int i = 0; i < h_in; ++i) { - memset(new_input_ptr, 0x00, pad * sizeof(float)); - new_input_ptr += pad; - memcpy(new_input_ptr, input, w_in * sizeof(float)); - new_input_ptr += w_in; - input += w_in; - memset(new_input_ptr, 0x00, pad * sizeof(float)); - new_input_ptr += pad; - } - memset(new_input_ptr, 0x00, w_new * pad * sizeof(float)); - new_input_ptr += w_new * pad; - } - return new_input; -} - -#ifdef __aarch64__ - -//! kernel for one out without extracting data mid -//! deal with four lines out -void compute_one_out_without_extract(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - float32x4_t w5, - float32x4_t w6, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! din0 - din7: 5 v20, v21 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0]\n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v31"); -} - -//! kernel for one out without extracting data mid -//! deal with four lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - float32x4_t w5, - float32x4_t w6, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! din0 - din7: 5 v20, v21 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0] \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - 
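// How the deleted compute_two_out_extract_pre handled the two left-padded
// outputs per row quad: a faddp cascade (faddp v5, v16, v17 / faddp v6,
// v18, v19 / faddp v5, v5, v6) collapses the four per-lane partial sums of
// each accumulator into one scalar per output line; the 5-tap weight rows
// are then shifted one tap with ext so the same loaded input rows serve
// the neighbouring column, and zip1/zip2 interleave the two result vectors
// before the paired "str d" stores. The replacement file drops these
// edge-case kernels in favour of the conv_block_utils-style padded
// pre-pack used elsewhere in this patch.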
"v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str 
d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, 
v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, 
v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
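      // Accumulation pattern: output row r is accumulated in v(16 + r),
      // gathering weight row k against input row r + k for k = 0..4, which is
      // why the input rows v8-v15 slide by one register per weight row across
      // the four accumulators v16-v19.  The "post" (right border) variants
      // shift the *inputs* left through the zeros in v31, so each pass sees
      // one fewer valid input column.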
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
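      // Epilogue: the faddp chains below collapse the four per-row
      // accumulators into a single vector whose lane r holds the full
      // horizontal sum for output row r; zip1/zip2 then interleave the
      // per-pass results into column order, v20 adds the bias, and fmax
      // against v31 (all zeros) applies the fused relu before the stores.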
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
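      // Left-border "pre" technique: the weights, not the inputs, are
      // shifted.  The q-loads above start at wh + 4 with a 20-byte
      // post-increment, so v0-v4 hold columns 1-4 of each 5-float weight row,
      // and every "ext vX, vX, v31, #4" pass drops one more leading column --
      // this third pass multiplies with columns 3-4 only.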
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
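      // The fifth weight column cannot be reached by ext-shifting v0-v4, so
      // it is gathered lane-by-lane above: "ld1 {v5.s}[i], [%[wh]], x0" with
      // x0 = 20 (one 5-float weight row, in bytes) collects w[4], w[9],
      // w[14] and w[19] into v5 and w[24] into v6.s[0]; they are applied
      // later against the input column held in v20/v21.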
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
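      // Padding split: pad_new = min(pad, 4) is the part a 5x5 window can
      // actually overlap, handled by the extract kernels above; pad_0 =
      // pad - pad_new yields output rows/columns that never touch real
      // input, so they are filled directly with the bias (or zeros) below
      // instead of being convolved.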
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[8]; - const float* dinl[8]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 8; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - float32x4_t w5; - float32x4_t w6; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 8 - pad_new > h_in) { - switch (h + 8 - pad_new - h_in) { - case 7: - din_list[1] = zero_ptr; - case 6: - din_list[2] = zero_ptr; - case 5: - din_list[3] = zero_ptr; - case 4: - din_list[4] = zero_ptr; - case 3: - din_list[5] = zero_ptr; - case 2: - din_list[6] = zero_ptr; - case 1: - din_list[7] = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - - //! every h loop, deal with 8 line input - dinl[0] = din_list[0]; - dinl[1] = din_list[1]; - dinl[2] = din_list[2]; - dinl[3] = din_list[3]; - dinl[4] = din_list[4]; - dinl[5] = din_list[5]; - dinl[6] = din_list[6]; - dinl[7] = din_list[7]; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - if (flag_bias) { - //! 
deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - *dout_ptr2++ = bias_c; - *dout_ptr3++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - //! mid loop - if (mid_cnt > 0) { - void* dinl_ptr = reinterpret_cast(dinl); - int mid_loop = mid_cnt; - asm volatile( - //! din: v7-v14 - //! dout: v15-v18 - "mov x0, #0 \n" - "mov x1, #4 \n" - "ldp x2, x3, [%[dinl]], #16 \n" - "ldp x4, x5, [%[dinl]], #16 \n" - "ldp x6, x7, [%[dinl]], #16 \n" - "ldp x8, x9, [%[dinl]], #16 \n" - - "ld1 {v7.4s} , [x2], x1 \n" - "ld1 {v8.4s} , [x3], x1 \n" - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - //! load bias - "ld1 {v19.4s}, [%[bias]] \n" - - "1: \n" - //! add bias to output - "mov v15.16b, v19.16b \n" - "mov v16.16b, v19.16b \n" - "mov v17.16b, v19.16b \n" - "mov v18.16b, v19.16b \n" - - //! 
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_post(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_post(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_post(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - memcpy(dout_ptr2, dout2, pad_0 * sizeof(float)); - memcpy(dout_ptr3, dout3, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - } - - din_list[0] = din_list[4]; - din_list[1] = din_list[5]; - din_list[2] = din_list[6]; - din_list[3] = din_list[7]; - din_list[4] = din_list[3] + w_in; - din_list[5] = din_list[4] + w_in; - din_list[6] = din_list[5] + w_in; - din_list[7] = din_list[6] + w_in; - - dout0 = dout3 + w_out; - dout1 = dout0 + w_out; - dout2 = dout1 + w_out; - dout3 = dout2 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[8]; - const float* dinl[8]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 8; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - float32x4_t w5; - float32x4_t w6; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 8 - pad_new > h_in) { - switch (h + 8 - pad_new - h_in) { - case 7: - din_list[1] = zero_ptr; - case 6: - din_list[2] = zero_ptr; - case 5: - din_list[3] = zero_ptr; - case 4: - din_list[4] = zero_ptr; - case 3: - din_list[5] = zero_ptr; - case 2: - din_list[6] = zero_ptr; - case 1: - din_list[7] = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - - //! every h loop, deal with 8 line input - dinl[0] = din_list[0]; - dinl[1] = din_list[1]; - dinl[2] = din_list[2]; - dinl[3] = din_list[3]; - dinl[4] = din_list[4]; - dinl[5] = din_list[5]; - dinl[6] = din_list[6]; - dinl[7] = din_list[7]; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - if (flag_bias) { - //! 
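The weight-register layout prepared above packs all 25 taps into seven q-registers: w0..w4 hold taps 0..3 of each 5-tap row, w5 gathers the fifth tap of rows 0..3, and w6 lane 0 holds weights[24]. A sketch of the same packing with intrinsics; note this sketch zero-initializes w5 and w6 before the lane inserts, whereas the code above sets lanes of uninitialized vectors:

#include <arm_neon.h>

static void load_5x5_weights(const float* wc, float32x4_t w[7]) {
  for (int r = 0; r < 5; ++r) {
    w[r] = vld1q_f32(wc + 5 * r);        // taps r*5 .. r*5+3
  }
  float32x4_t w5 = vdupq_n_f32(0.f);     // initialized before lane sets
  w5 = vsetq_lane_f32(wc[4], w5, 0);
  w5 = vsetq_lane_f32(wc[9], w5, 1);
  w5 = vsetq_lane_f32(wc[14], w5, 2);
  w5 = vsetq_lane_f32(wc[19], w5, 3);
  w[5] = w5;
  w[6] = vsetq_lane_f32(wc[24], vdupq_n_f32(0.f), 0);
}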
deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - *dout_ptr2++ = bias_relu; - *dout_ptr3++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_pre_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_pre_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_pre_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - //! mid loop - if (mid_cnt > 0) { - void* dinl_ptr = reinterpret_cast(dinl); - int mid_loop = mid_cnt; - asm volatile( - //! din: v7-v14 - //! dout: v15-v18 - "mov x0, #0 \n" - "mov x1, #4 \n" - "movi v31.4s, #0 \n" - "ldp x2, x3, [%[dinl]], #16 \n" - "ldp x4, x5, [%[dinl]], #16 \n" - "ldp x6, x7, [%[dinl]], #16 \n" - "ldp x8, x9, [%[dinl]], #16 \n" - - "ld1 {v7.4s} , [x2], x1 \n" - "ld1 {v8.4s} , [x3], x1 \n" - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - //! load bias - "ld1 {v19.4s}, [%[bias]] \n" - - "1: \n" - //! add bias to output - "mov v15.16b, v19.16b \n" - "mov v16.16b, v19.16b \n" - "mov v17.16b, v19.16b \n" - "mov v18.16b, v19.16b \n" - - //! 
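For the padded fringe handled above, the output never touches input data, so it is pre-filled with the bias value, clamped to zero when ReLU is fused (the bias_relu scalar). A minimal intrinsics sketch of that prologue, assuming nothing beyond what the pad_cnt/pad_remain loops above already do:

#include <arm_neon.h>
#include <algorithm>

static void fill_pad_with_bias_relu(float* dout, int n, float bias_c) {
  float v = std::max(bias_c, 0.f);   // ReLU applied once, on the scalar
  float32x4_t vv = vdupq_n_f32(v);
  int i = 0;
  for (; i + 4 <= n; i += 4) {
    vst1q_f32(dout + i, vv);         // 4 floats per store (pad_cnt loop)
  }
  for (; i < n; ++i) {
    dout[i] = v;                     // scalar tail (pad_remain loop)
  }
}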
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "fmax v15.4s, v15.4s, v31.4s \n" - "fmax v16.4s, v16.4s, v31.4s \n" - "fmax v17.4s, v17.4s, v31.4s \n" - "fmax v18.4s, v18.4s, v31.4s \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v31"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
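v31 is zeroed once at entry ("movi v31.4s, #0"), so the four fmax instructions above apply ReLU in-register to each accumulator just before the stores. The equivalent intrinsic, for reference:

#include <arm_neon.h>

static inline float32x4_t relu_q(float32x4_t acc) {
  // Same operation as "fmax vN.4s, vN.4s, v31.4s" with v31 == 0.
  return vmaxq_f32(acc, vdupq_n_f32(0.f));
}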
deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_post_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_post_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_post_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - memcpy(dout_ptr2, dout2, pad_0 * sizeof(float)); - memcpy(dout_ptr3, dout3, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - } - - din_list[0] = din_list[4]; - din_list[1] = din_list[5]; - din_list[2] = din_list[6]; - din_list[3] = din_list[7]; - din_list[4] = din_list[3] + w_in; - din_list[5] = din_list[4] + w_in; - din_list[6] = din_list[5] + w_in; - din_list[7] = din_list[6] + w_in; - - dout0 = dout3 + w_out; - dout1 = dout0 + w_out; - dout2 = dout1 + w_out; - dout3 = dout2 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_small_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; // NOLINT - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 8 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - const float* din6 = din5 + w_in_new; - const float* din7 = din6 + w_in_new; - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - float32x4_t w5; - float32x4_t w6; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! 
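The small-width variant trades the in-loop padding logic of the large kernels for an explicitly pre-padded input buffer. Judging only from the call sites visible here (prepad_input(...) at entry, free(din_new) at exit), the helper behaves roughly like the sketch below; the real implementation lives elsewhere in this file and may differ:

#include <cstdlib>
#include <cstring>

static float* prepad_input_sketch(const float* din, int num, int ch,
                                  int h_in, int w_in, int pad) {
  int h_new = h_in + 2 * pad;
  int w_new = w_in + 2 * pad;
  size_t plane = static_cast<size_t>(h_new) * w_new;
  // Zeroed buffer; each channel plane is copied into its centre.
  float* out = static_cast<float*>(
      calloc(static_cast<size_t>(num) * ch * plane, sizeof(float)));
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* src = din + (static_cast<size_t>(n) * ch + c) *
                                   (static_cast<size_t>(h_in) * w_in);
      float* dst = out + (static_cast<size_t>(n) * ch + c) * plane +
                   static_cast<size_t>(pad) * w_new + pad;
      for (int h = 0; h < h_in; ++h) {
        memcpy(dst + static_cast<size_t>(h) * w_new,
               src + static_cast<size_t>(h) * w_in, w_in * sizeof(float));
      }
    }
  }
  return out;  // caller frees, matching the free(din_new) above
}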
(h - pad_new) + 7 > h_in - 1 - if (h + 8 > h_in_new) { - switch (h + 8 - h_in_new) { - case 7: - din1 = zero_ptr; - case 6: - din2 = zero_ptr; - case 5: - din3 = zero_ptr; - case 4: - din4 = zero_ptr; - case 3: - din5 = zero_ptr; - case 2: - din6 = zero_ptr; - case 1: - din7 = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - const float* din_ptr7 = din7; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - *dout_ptr2++ = bias_c; - *dout_ptr3++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - din_ptr6, - din_ptr7, - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - din_ptr6++; - din_ptr7++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - memcpy(dout_ptr2, dout2, pad_0 * sizeof(float)); - memcpy(dout_ptr3, dout3, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - } - - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din7; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - din6 = din5 + w_in_new; - din7 = din6 + w_in_new; - - dout0 = dout3 + w_out; - dout1 = dout0 + w_out; - dout2 = dout1 + w_out; - dout3 = dout2 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! 
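At the bottom of each h-loop iteration above, four output rows have consumed eight input rows, so the pointers rotate: rows 4..7 become the next iteration's rows 0..3 and four fresh rows are chained on one stride apart (the deliberate switch fall-throughs earlier clamp out-of-range rows to zero_ptr, and surplus output rows to write_ptr). A compact sketch of that rotation:

static void rotate_input_rows(const float* rows[8], int row_stride) {
  // Mirrors "din4 = din3 + w_in_new" evaluated after "din3 = din7":
  // each new row is one stride past its (already updated) predecessor.
  for (int i = 0; i < 4; ++i) rows[i] = rows[i + 4];
  for (int i = 4; i < 8; ++i) rows[i] = rows[i - 1] + row_stride;
}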
deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} - -void conv_depthwise_5x5s1_small_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - float zero_ptr[w_in_new + w_out]; // NOLINT - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - - //! every h loop, deal with 8 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - const float* din6 = din5 + w_in_new; - const float* din7 = din6 + w_in_new; - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - float32x4_t w5; - float32x4_t w6; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! 
(h - pad_new) + 7 > h_in - 1 - if (h + 8 > h_in_new) { - switch (h + 8 - h_in_new) { - case 7: - din1 = zero_ptr; - case 6: - din2 = zero_ptr; - case 5: - din3 = zero_ptr; - case 4: - din4 = zero_ptr; - case 3: - din5 = zero_ptr; - case 2: - din6 = zero_ptr; - case 1: - din7 = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - const float* din_ptr7 = din7; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - *dout_ptr2++ = bias_relu; - *dout_ptr3++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - din_ptr6, - din_ptr7, - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - din_ptr6++; - din_ptr7++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - memcpy(dout_ptr2, dout2, pad_0 * sizeof(float)); - memcpy(dout_ptr3, dout3, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - } - - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din7; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - din6 = din5 + w_in_new; - din7 = din6 + w_in_new; - - dout0 = dout3 + w_out; - dout1 = dout0 + w_out; - dout2 = dout1 + w_out; - dout3 = dout2 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} - -#else - -//! kernel for one out without extracting data mid -//! 
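The ARMv7 kernels that follow have only sixteen q-registers to work with, so instead of four-row blocks they compute two output rows at a time and fold the per-lane products with pairwise adds. The recurring vpadd reduction (three vpadd instructions per output pair), expressed with intrinsics:

#include <arm_neon.h>

static float32x2_t reduce_two_rows(float32x4_t acc0, float32x4_t acc1) {
  float32x2_t a = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
  float32x2_t b = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
  // lane 0 = sum(acc0) -> dout0, lane 1 = sum(acc1) -> dout1
  return vpadd_f32(a, b);
}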
deal with two lines out -void compute_one_out_without_extract(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data mid -//! deal with two lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out without extracting data pre -//! deal with two lines out -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data pre -//! 
deal with two lines out -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out with extracting data post -//! deal with two lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out with extracting data post -//! 
deal with two lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
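The "vext.32 qN, qN, q15, #1" block above shifts each input vector left one lane, with q15 == 0 filling in from the right. That emulates the right-hand zero padding, so the very same multiply-accumulate sequence can then produce the next, partially padded output column. With intrinsics:

#include <arm_neon.h>

static inline float32x4_t shift_in_zero(float32x4_t v) {
  // {v1, v2, v3, 0}: window advances one column, zero enters on the right.
  return vextq_f32(v, vdupq_n_f32(0.f), 1);
}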
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
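The reductions leave each d-register grouped by output column, i.e. {row0, row1}; the "vtrn.32 d22, d23" step transposes that 2x2 lane matrix so each final vst1 writes two adjacent columns of a single output row. Equivalent intrinsic form:

#include <arm_neon.h>

static void transpose_2x2(float32x2_t* a, float32x2_t* b) {
  float32x2x2_t t = vtrn_f32(*a, *b);  // val[0] = {a0, b0}, val[1] = {a1, b1}
  *a = t.val[0];                       // row 0: columns j, j+1
  *b = t.val[1];                       // row 1: columns j, j+1
}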
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // relu - "vmax.f32 d18, d18, d16 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
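// The _relu variants fuse the activation into the epilogue: a q register is
// zeroed with "vmov.i32 #0" and the biased result is clamped with
// "vmax.f32" just before the store. The same step with intrinsics, as an
// illustrative sketch:
#include <arm_neon.h>
static inline float32x4_t bias_relu(float32x4_t acc, float32x4_t vbias) {
  return vmaxq_f32(vaddq_f32(acc, vbias), vdupq_n_f32(0.f));
}
//!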
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - "vmax.f32 d16, d16, d18 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! 
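// In the *_extract_post kernels, right-border columns are produced by
// shifting each input row one lane left and pulling zeros in from a zeroed
// register ("vmov.f32 q15, #0.0" + "vext.32 q2, q2, q15, #1"), which
// assumes zero padding beyond the right edge. A sketch of that shift:
#include <arm_neon.h>
static inline float32x4_t shift_in_zero(float32x4_t row) {
  return vextq_f32(row, vdupq_n_f32(0.f), 1);  // {r1, r2, r3, 0}
}
//!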
out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! 
out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
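// "mov r0, #20" used throughout these kernels is a 20-byte post-increment:
// the 5x5 kernel is stored row-major as 25 floats, so stepping 5 floats per
// load walks down one column of the kernel. Equivalent scalar indexing, as
// a sketch:
static inline float kernel_at(const float* wh, int row, int col) {
  return wh[row * 5 + col];  // 20-byte row stride == 5 floats
}
//!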
out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! 
out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! 
deal with two lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q5, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q5 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // relu - "vmax.f32 q8, q8, q5 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, +#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) +#ifdef __aarch64__ +void conv_depthwise_5x5s1_fp32(float* dout, + const float* din, const float* weights, const float* bias, - int pad, bool flag_bias, bool flag_relu, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int padw, + int padh, + const operators::ConvParam& param, ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! 
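// The removed implementation splits the requested padding into pad_new (at
// most 4, covered by the extract kernels) and pad_0 (the remainder, which
// can only ever produce bias/zero outputs), then counts 4-wide vector steps
// for the interior of each row. The same arithmetic as a standalone sketch:
struct PadSplit {
  int pad_new, pad_0, mid_cnt, mid_remain;
};
static PadSplit split_pad_5x5(int pad, int w_out) {
  PadSplit s;
  s.pad_new = pad > 4 ? 4 : pad;         // at most kernel_size - 1
  s.pad_0 = pad - s.pad_new;             // emitted as constant rows/columns
  int mid = w_out - 2 * pad;             // interior outputs per row
  s.mid_cnt = mid >> 2;                  // full 4-wide iterations
  s.mid_remain = mid - (s.mid_cnt << 2); // leftover single outputs
  return s;
}
//!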
every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; - asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - //! load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "1: \n" - //! 
add bias to output
- "vmov.32 q13, q15 \n"
- "vmov.32 q14, q15 \n"
-
- "pld [%[din0]] \n"
- "pld [%[din1]] \n"
- "pld [%[din2]] \n"
- "pld [%[din3]] \n"
- "pld [%[din4]] \n"
- "pld [%[din5]] \n"
-
- // weights col 0
- "vmla.f32 q13, q7, d0[0] \n"
- "vmla.f32 q14, q8, d0[0] \n"
-
- "vmla.f32 q13, q8, d2[0] \n"
- "vmla.f32 q14, q9, d2[0] \n"
-
- "vld1.32 {d14-d15}, [%[din0]], r1 \n"
- "vld1.32 {d16-d17}, [%[din1]], r1 \n"
-
- "vmla.f32 q13, q9, d4[0] \n"
- "vmla.f32 q14, q10, d4[0] \n"
-
- "vmla.f32 q13, q10, d6[0] \n"
- "vmla.f32 q14, q11, d6[0] \n"
-
- "vld1.32 {d18-d19}, [%[din2]], r1 \n"
- "vld1.32 {d20-d21}, [%[din3]], r1 \n"
-
- "vmla.f32 q13, q11, d8[0] \n"
- "vmla.f32 q14, q12, d8[0] \n"
-
- "vld1.32 {d22-d23}, [%[din4]], r1 \n"
- "vld1.32 {d24-d25}, [%[din5]], r1 \n"
-
- // weights col 1
- "vmla.f32 q13, q7, d0[1] \n"
- "vmla.f32 q14, q8, d0[1] \n"
-
- "vmla.f32 q13, q8, d2[1] \n"
- "vmla.f32 q14, q9, d2[1] \n"
-
- "vld1.32 {d14-d15}, [%[din0]], r1 \n"
- "vld1.32 {d16-d17}, [%[din1]], r1 \n"
-
- "vmla.f32 q13, q9, d4[1] \n"
- "vmla.f32 q14, q10, d4[1] \n"
-
- "vmla.f32 q13, q10, d6[1] \n"
- "vmla.f32 q14, q11, d6[1] \n"
-
- "vld1.32 {d18-d19}, [%[din2]], r1 \n"
- "vld1.32 {d20-d21}, [%[din3]], r1 \n"
-
- "vmla.f32 q13, q11, d8[1] \n"
- "vmla.f32 q14, q12, d8[1] \n"
-
- "vld1.32 {d22-d23}, [%[din4]], r1 \n"
- "vld1.32 {d24-d25}, [%[din5]], r1 \n"
-
- // weights col 2
- "vmla.f32 q13, q7, d1[0] \n"
- "vmla.f32 q14, q8, d1[0] \n"
-
- "vmla.f32 q13, q8, d3[0] \n"
- "vmla.f32 q14, q9, d3[0] \n"
-
- "vld1.32 {d14-d15}, [%[din0]], r1 \n"
- "vld1.32 {d16-d17}, [%[din1]], r1 \n"
-
- "vmla.f32 q13, q9, d5[0] \n"
- "vmla.f32 q14, q10, d5[0] \n"
-
- "vmla.f32 q13, q10, d7[0] \n"
- "vmla.f32 q14, q11, d7[0] \n"
-
- "vld1.32 {d18-d19}, [%[din2]], r1 \n"
- "vld1.32 {d20-d21}, [%[din3]], r1 \n"
-
- "vmla.f32 q13, q11, d9[0] \n"
- "vmla.f32 q14, q12, d9[0] \n"
-
- "vld1.32 {d22-d23}, [%[din4]], r1 \n"
- "vld1.32 {d24-d25}, [%[din5]], r1 \n"
-
- // weights col 3
- "vmla.f32 q13, q7, d1[1] \n"
- "vmla.f32 q14, q8, d1[1] \n"
-
- "vmla.f32 q13, q8, d3[1] \n"
- "vmla.f32 q14, q9, d3[1] \n"
-
- "vld1.32 {d14-d15}, [%[din0]], r1 \n"
- "vld1.32 {d16-d17}, [%[din1]], r1 \n"
-
- "vmla.f32 q13, q9, d5[1] \n"
- "vmla.f32 q14, q10, d5[1] \n"
-
- "vmla.f32 q13, q10, d7[1] \n"
- "vmla.f32 q14, q11, d7[1] \n"
-
- "vld1.32 {d18-d19}, [%[din2]], r1 \n"
- "vld1.32 {d20-d21}, [%[din3]], r1 \n"
-
- "vmla.f32 q13, q11, d9[1] \n"
- "vmla.f32 q14, q12, d9[1] \n"
-
- "vld1.32 {d22-d23}, [%[din4]], r1 \n"
- "vld1.32 {d24-d25}, [%[din5]], r1 \n"
-
- // weights col 4
- "vmla.f32 q13, q7, d10[0] \n"
- "vmla.f32 q14, q8, d10[0] \n"
-
- "vmla.f32 q13, q8, d10[1] \n"
- "vmla.f32 q14, q9, d10[1] \n"
-
- "vmla.f32 q13, q9, d11[0] \n"
- "vmla.f32 q14, q10, d11[0] \n"
-
- "vmla.f32 q13, q10, d11[1] \n"
- "vmla.f32 q14, q11, d11[1] \n"
-
- "vmla.f32 q13, q11, d12[0] \n"
- "vmla.f32 q14, q12, d12[0] \n"
-
- // store result
- "vst1.32 {d26-d27}, [%[out0]]! \n"
- "vst1.32 {d28-d29}, [%[out1]]!
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - + const int threads = ctx->threads(); + int llc_size = ctx->llc_size() / 4; + auto act_param = param.activation_param; + const int hout_c_block = 4; + const int hout_r_kernel = 2; + const int wout_block = 4; + const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block; + const int win_round = wout_round + 4; + + //! get h block + //! llc_size = threads * win_round * hout_c_block * hin_r_block * + //! sizeof(float) + //! + wout_round * hout_c_block * hout_r_block * threads * sizeof(float) + //! win_round = wout_round + 4 + //! hin_r_block = hout_r_block + 4 + int hout_r_block = (llc_size - 16 * win_round * hout_c_block * threads) / + (win_round * hout_c_block * threads * 4 + + hout_c_block * wout_round * threads * 4); + hout_r_block = hout_r_block > hout ? hout : hout_r_block; + hout_r_block = + ((hout_r_block + hout_r_kernel - 1) / hout_r_kernel) * hout_r_kernel; + hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; + + const int hin_r_block = hout_r_block + 4; + + float* tmp_work_space = ctx->workspace_data(); + float ptr_zero[win_round]; // NOLINT + memset(ptr_zero, 0, sizeof(float) * win_round); + float ptr_write[wout_round]; // NOLINT + + int in_len = win_round * hout_c_block; + int pre_in_size = hin_r_block * in_len; + pre_in_size = ROUNDUP(pre_in_size, 4); + int pre_out_size = hout_c_block * hout_r_block * wout_round; + + float* tmp_din = tmp_work_space; + + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + int w_stride = 25; // kernel_w * kernel_h; + + int ws = -padw; + int we = ws + win_round; + int w_loop = wout_round / 4; + int chout = chin; + + int out_row_stride = hout_c_block * wout_round; for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! 
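// The replacement kernel sizes its output-row block so that, across all
// threads, the pre-packed input (hout_r_block + 4 rows of win_round C4
// pixels) plus the pre-computed output tile fit in the llc_size budget.
// Solving the byte-count identity in the comment above for hout_r_block
// gives the expression used; the same arithmetic as a sketch:
static int row_block_from_cache(int llc_size, int threads, int win_round,
                                int wout_round, int c_block /* 4 */) {
  // llc_size >= threads*4*(win_round*c_block*(hr+4) + wout_round*c_block*hr)
  return (llc_size - 16 * win_round * c_block * threads) /
         (win_round * c_block * threads * 4 +
          c_block * wout_round * threads * 4);
}
//!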
every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; + const float* din_batch = din + n * chin * size_in_channel; + float* dout_batch = dout + n * chout * size_out_channel; + for (int h = 0; h < hout; h += hout_r_block) { + int h_kernel = hout_r_block; + if (h + hout_r_block > hout) { + h_kernel = hout - h; + } + int hs = h - padh; + int he = hs + h_kernel + 4; + +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < chout; c += hout_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = + tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size); + float* pre_out = pre_din + pre_in_size; +#else + float* pre_din = tmp_din; + float* pre_out = pre_din + pre_in_size; +#endif + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, chin, win, hin, ptr_zero); + const float* block_inr0 = pre_din; + const float* block_inr1 = block_inr0 + in_len; + const float* block_inr2 = block_inr1 + in_len; + const float* block_inr3 = block_inr2 + in_len; + const float* block_inr4 = block_inr3 + in_len; + const float* block_inr5 = block_inr4 + in_len; + + const float* weight_c = weights + c * w_stride; + float bias_local[4] = {0, 0, 0, 0}; if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! 
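// prepack_input_nxwc4_dw (assumed layout: 4 consecutive channels
// interleaved per pixel, rows of win_round pixels, zero-filled outside
// [hs,he) x [ws,we)) lets every 128-bit load in the kernel fetch one pixel
// across 4 channels. A sketch of the index mapping under that assumption:
static void pack_c4_row(const float* src, float* dst, int c0, int w0,
                        int width, int chin, int size_in_channel) {
  for (int w = 0; w < width; ++w) {
    for (int j = 0; j < 4; ++j) {  // channel-interleaved C4 layout
      int c = c0 + j;
      dst[w * 4 + j] = (c < chin) ? src[c * size_in_channel + w0 + w] : 0.f;
    }
  }
}
//!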
deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } + for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { + int cnt = w_loop; + const float* inr0 = block_inr0; + const float* inr1 = block_inr1; + const float* inr2 = block_inr2; + const float* inr3 = block_inr3; + const float* inr4 = block_inr4; + const float* inr5 = block_inr5; + + float* ptr_out0 = pre_out + hk * out_row_stride; + float* ptr_out1 = ptr_out0 + out_row_stride; + // clang-format off + auto wptr = weight_c; asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - "vmov.i32 q15, #0x0 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - "1: \n" - - //! 
load bias to output - "vld1.32 {d26-d27}, [%[bias]] \n" - "vld1.32 {d28-d29}, [%[bias]] \n" - - "pld [%[din0]] \n" - "pld [%[din1]] \n" - "pld [%[din2]] \n" - "pld [%[din3]] \n" - "pld [%[din4]] \n" - "pld [%[din5]] \n" - - // weights col 0 - "vmla.f32 q13, q7, d0[0] \n" - "vmla.f32 q14, q8, d0[0] \n" - - "vmla.f32 q13, q8, d2[0] \n" - "vmla.f32 q14, q9, d2[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[0] \n" - "vmla.f32 q14, q10, d4[0] \n" - - "vmla.f32 q13, q10, d6[0] \n" - "vmla.f32 q14, q11, d6[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[0] \n" - "vmla.f32 q14, q12, d8[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 1 - "vmla.f32 q13, q7, d0[1] \n" - "vmla.f32 q14, q8, d0[1] \n" - - "vmla.f32 q13, q8, d2[1] \n" - "vmla.f32 q14, q9, d2[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[1] \n" - "vmla.f32 q14, q10, d4[1] \n" - - "vmla.f32 q13, q10, d6[1] \n" - "vmla.f32 q14, q11, d6[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[1] \n" - "vmla.f32 q14, q12, d8[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 2 - "vmla.f32 q13, q7, d1[0] \n" - "vmla.f32 q14, q8, d1[0] \n" - - "vmla.f32 q13, q8, d3[0] \n" - "vmla.f32 q14, q9, d3[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[0] \n" - "vmla.f32 q14, q10, d5[0] \n" - - "vmla.f32 q13, q10, d7[0] \n" - "vmla.f32 q14, q11, d7[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[0] \n" - "vmla.f32 q14, q12, d9[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 3 - "vmla.f32 q13, q7, d1[1] \n" - "vmla.f32 q14, q8, d1[1] \n" - - "vmla.f32 q13, q8, d3[1] \n" - "vmla.f32 q14, q9, d3[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[1] \n" - "vmla.f32 q14, q10, d5[1] \n" - - "vmla.f32 q13, q10, d7[1] \n" - "vmla.f32 q14, q11, d7[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[1] \n" - "vmla.f32 q14, q12, d9[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 4 - "vmla.f32 q13, q7, d10[0] \n" - "vmla.f32 q14, q8, d10[0] \n" - - "vmla.f32 q13, q8, d10[1] \n" - "vmla.f32 q14, q9, d10[1] \n" - - "vmla.f32 q13, q9, d11[0] \n" - "vmla.f32 q14, q10, d11[0] \n" - - "vmla.f32 q13, q10, d11[1] \n" - "vmla.f32 q14, q11, d11[1] \n" - - "vmla.f32 q13, q11, d12[0] \n" - "vmla.f32 q14, q12, d12[0] \n" - - // relu - "vmax.f32 q13, q13, q15 \n" - "vmax.f32 q14, q14, q15 \n" - - // store result - "vst1.32 {d26-d27}, [%[out0]]! \n" - "vst1.32 {d28-d29}, [%[out1]]! 
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_small_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; // NOLINT - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! 
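// The *_small_impl variants avoid the edge kernels entirely: prepad_input
// copies each feature map into a buffer zero-padded by pad_new on every
// side, so only the generic per-pixel kernel runs. A sketch of such a
// pre-pad copy (hypothetical helper; the real prepad_input also handles
// the batch and channel dimensions):
#include <cstdlib>
#include <cstring>
static float* prepad_2d(const float* src, int h, int w, int pad) {
  int hp = h + 2 * pad, wp = w + 2 * pad;
  float* dst = static_cast<float*>(calloc(hp * wp, sizeof(float)));
  for (int r = 0; r < h; ++r) {
    memcpy(dst + (r + pad) * wp + pad, src + r * w, w * sizeof(float));
  }
  return dst;  // caller frees, as the removed code does with free(din_new)
}
//!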
deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} - -void conv_depthwise_5x5s1_small_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; // NOLINT - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! 
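// Each trip of the h loop consumes six input rows and emits two output
// rows, then slides the window down by two: rows 2..5 become rows 0..3 and
// two fresh rows are appended (note din4 is derived from the just-rotated
// din3, exactly as in the removed code). A sketch of the rotation:
static void slide_window(const float* r[6], int w_in) {
  r[0] = r[2]; r[1] = r[3]; r[2] = r[4]; r[3] = r[5];
  r[4] = r[3] + w_in;  // first new input row
  r[5] = r[4] + w_in;  // second new input row
}
//!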
(h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! 
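The `switch (h + 6 - h_in_new)` above relies on deliberate fall-through: once the window runs past the bottom of the (pre-padded) input, every out-of-range row pointer from that case downward is redirected to the shared zero row. An equivalent loop, as a sketch:

    // Sketch of the fall-through switch: rows past the input bottom read zeros.
    // Row 0 is always in range, so only rows 1..5 are ever clamped.
    inline void ClampRowsToZero(const float* rows[6], int h, int h_in_new,
                                const float* zero_row) {
      if (h + 6 > h_in_new) {
        for (int k = h_in_new - h; k < 6; ++k) rows[k] = zero_row;
      }
    }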
deal with h_out pad_0 line without bias - memset(reinterpret_cast(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); + "ldr q24, [%[bias]] \n" /* load bias to out00 */ + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[wc]], #64 \n" /* load w0-w3 */ + "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[inr0]], #64 \n" /* load inr0, 0-3 */ + "1:\n" + "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[inr1]], #64 \n" /* load inr1, 0-3 */ + "mov v25.16b, v24.16b \n" /* mov bias to out01 */ + "mov v26.16b, v24.16b \n" /* mov bias to out02 */ + "mov v27.16b, v24.16b \n" /* mov bias to out03 */ + "mov v28.16b, v24.16b \n" /* mov bias to out10 */ + "mov v29.16b, v24.16b \n" /* mov bias to out11 */ + "mov v30.16b, v24.16b \n" /* mov bias to out12 */ + "mov v31.16b, v24.16b \n" /* mov bias to out13 */ + // out row0 + "fmla v24.4s, v8.4s, v0.4s \n" /* out00 = w0 * inr00 */ + "fmla v25.4s, v9.4s, v0.4s \n" /* out01 = w0 * inr01 */ + "ldp q12, q13, [%[inr0]] \n" /* load inr0, 4-5 */ + "fmla v26.4s, v10.4s, v0.4s \n" /* out02 = w0 * inr02 */ + "fmla v27.4s, v11.4s, v0.4s \n" /* out03 = w0 * inr03 */ + "fmla v28.4s, v16.4s, v0.4s \n" /* out10 = w0 * inr10 */ + "fmla v29.4s, v17.4s, v0.4s \n" /* out11 = w0 * inr11 */ + "ldp q20, q21, [%[inr1]] \n" /* load inr1, 4-5 */ + "fmla v30.4s, v18.4s, v0.4s \n" /* out12 = w0 * inr12 */ + "fmla v31.4s, v19.4s, v0.4s \n" /* out13 = w0 * inr13 */ + "fmla v24.4s, v9.4s, v1.4s \n" /* out00 = w1 * inr01 */ + "fmla v25.4s, v10.4s, v1.4s \n" /* out01 = w1 * inr02 */ + "fmla v26.4s, v11.4s, v1.4s \n" /* out02 = w1 * inr03 */ + "fmla v27.4s, v12.4s, v1.4s \n" /* out03 = w1 * inr04 */ + "ldp q14, q15, [%[inr0], #32] \n" /* load inr0, 6-7 */ + "fmla v28.4s, v17.4s, v1.4s \n" /* out10 = w1 * inr11 */ + "fmla v29.4s, v18.4s, v1.4s \n" /* out11 = w1 * inr12 */ + "fmla v30.4s, v19.4s, v1.4s \n" /* out12 = w1 * inr13 */ + "fmla v31.4s, v20.4s, v1.4s \n" /* out13 = w1 * inr14 */ + "fmla v24.4s, v10.4s, v2.4s \n" /* out00 = w2 * inr02 */ + "fmla v25.4s, v11.4s, v2.4s \n" /* out01 = w2 * inr03 */ + "fmla v26.4s, v12.4s, v2.4s \n" /* out02 = w2 * inr04 */ + "fmla v27.4s, v13.4s, v2.4s \n" /* out03 = w2 * inr05 */ + "ldp q22, q23, [%[inr1], #32] \n" /* load inr1, 6-7 */ + "fmla v28.4s, v18.4s, v2.4s \n" /* out10 = w2 * inr12 */ + "fmla v29.4s, v19.4s, v2.4s \n" /* out11 = w2 * inr13 */ + "fmla v30.4s, v20.4s, v2.4s \n" /* out12 = w2 * inr14 */ + "fmla v31.4s, v21.4s, v2.4s \n" /* out13 = w2 * inr15 */ + "ldp q4, q5, [%[wc]], #32 \n" /* load w4-w5 */ + "fmla v24.4s, v11.4s, v3.4s \n" /* out00 = w3 * inr03 */ + "fmla v25.4s, v12.4s, v3.4s \n" /* out01 = w3 * inr04 */ + "fmla v26.4s, v13.4s, v3.4s \n" /* out02 = w3 * inr05 */ + "fmla v27.4s, v14.4s, v3.4s \n" /* out03 = w3 * inr06 */ + "ldp q6, q7, [%[wc]], #32 \n" /* load w6-w7 */ + "fmla v28.4s, v19.4s, v3.4s \n" /* out10 = w3 * inr13 */ + "fmla v29.4s, v20.4s, v3.4s \n" /* out11 = w3 * inr14 */ + "fmla v30.4s, v21.4s, v3.4s \n" /* out12 = w3 * inr15 */ + "fmla v31.4s, v22.4s, v3.4s \n" /* out13 = w3 * inr16 */ + "fmla v24.4s, v12.4s, v4.4s \n" /* out00 = w4 * inr04 */ + "fmla v25.4s, v13.4s, v4.4s \n" /* out01 = w4 * inr05 */ + "fmla v26.4s, v14.4s, v4.4s \n" /* out02 = w4 * inr06 */ + "fmla v27.4s, v15.4s, v4.4s \n" /* out03 = w4 * inr07 */ + "ldp q8, q9, [%[inr2]], #32 \n" /* load inr2, 0-1 */ + "fmla v28.4s, v20.4s, v4.4s \n" /* out10 = w4 * inr14 */ + "fmla v29.4s, v21.4s, v4.4s \n" /* out11 = w4 * inr15 */ + "fmla v30.4s, v22.4s, v4.4s \n" /* out12 = w4 * inr16 */ + "fmla v31.4s, v23.4s, v4.4s \n" /* out13 = w4 * inr17 */ + "ldp q10, q11, [%[inr2]], #32\n" 
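The new AArch64 body that begins here keeps a full 2x4 tile of 4-channel outputs in v24..v31: the bias vector is loaded once into v24 and copied into the other seven accumulators, then each weight vector w0..w24 is multiply-accumulated against four neighboring packed input vectors per output row. A minimal intrinsics sketch of that pattern (names are illustrative; `vfmaq_f32` is the AArch64 form, the armv7 path uses `vmlaq_f32`):

    #include <arm_neon.h>

    // Sketch: one weight vector FMA'd across a 1x4 strip of packed (c/4)
    // inputs, mirroring the "fmla v24..v27, vIN, w" groups in the asm body.
    inline void FmaStripC4(const float* in /* >= 4 packed vectors */,
                           float32x4_t w, float32x4_t out[4]) {
      for (int i = 0; i < 4; ++i)
        out[i] = vfmaq_f32(out[i], vld1q_f32(in + 4 * i), w);
    }

    // Sketch: bias broadcast into all accumulators ("mov vN.16b, v24.16b").
    inline void InitTileWithBias(const float* bias_local, float32x4_t out[8]) {
      float32x4_t vb = vld1q_f32(bias_local);
      for (int i = 0; i < 8; ++i) out[i] = vb;
    }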
/* load inr2, 2-3 */ + // out row1 + "fmla v24.4s, v16.4s, v5.4s \n" /* out00 = w5 * inr10 */ + "fmla v25.4s, v17.4s, v5.4s \n" /* out01 = w5 * inr11 */ + "fmla v26.4s, v18.4s, v5.4s \n" /* out02 = w5 * inr12 */ + "fmla v27.4s, v19.4s, v5.4s \n" /* out03 = w5 * inr13 */ + "ldp q12, q13, [%[inr2]] \n" /* load inr2, 4-5 */ + "fmla v28.4s, v8.4s, v5.4s \n" /* out10 = w5 * inr20 */ + "fmla v29.4s, v9.4s, v5.4s \n" /* out11 = w5 * inr21 */ + "fmla v30.4s, v10.4s, v5.4s \n" /* out12 = w5 * inr22 */ + "fmla v31.4s, v11.4s, v5.4s \n" /* out13 = w5 * inr23 */ + "fmla v24.4s, v17.4s, v6.4s \n" /* out00 = w6 * inr11 */ + "fmla v25.4s, v18.4s, v6.4s \n" /* out01 = w6 * inr12 */ + "fmla v26.4s, v19.4s, v6.4s \n" /* out02 = w6 * inr13 */ + "fmla v27.4s, v20.4s, v6.4s \n" /* out03 = w6 * inr14 */ + "ldp q14, q15, [%[inr2], #32]\n" /* load inr2, 6-7 */ + "fmla v28.4s, v9.4s, v6.4s \n" /* out10 = w6 * inr21 */ + "fmla v29.4s, v10.4s, v6.4s \n" /* out11 = w6 * inr22 */ + "fmla v30.4s, v11.4s, v6.4s \n" /* out12 = w6 * inr23 */ + "fmla v31.4s, v12.4s, v6.4s \n" /* out13 = w6 * inr24 */ + "fmla v24.4s, v18.4s, v7.4s \n" /* out00 = w7 * inr12 */ + "fmla v25.4s, v19.4s, v7.4s \n" /* out01 = w7 * inr13 */ + "fmla v26.4s, v20.4s, v7.4s \n" /* out02 = w7 * inr14 */ + "fmla v27.4s, v21.4s, v7.4s \n" /* out03 = w7 * inr15 */ + "ldp q0, q1, [%[wc]], #32 \n" /* load w8-w9 */ + "fmla v28.4s, v10.4s, v7.4s \n" /* out10 = w7 * inr22 */ + "fmla v29.4s, v11.4s, v7.4s \n" /* out11 = w7 * inr23 */ + "fmla v30.4s, v12.4s, v7.4s \n" /* out12 = w7 * inr24 */ + "fmla v31.4s, v13.4s, v7.4s \n" /* out13 = w7 * inr25 */ + "fmla v24.4s, v19.4s, v0.4s \n" /* out00 = w8 * inr13 */ + "fmla v25.4s, v20.4s, v0.4s \n" /* out01 = w8 * inr14 */ + "fmla v26.4s, v21.4s, v0.4s \n" /* out02 = w8 * inr15 */ + "fmla v27.4s, v22.4s, v0.4s \n" /* out03 = w8 * inr16 */ + "ldp q2, q3, [%[wc]], #32 \n" /* load w10-w11 */ + "fmla v28.4s, v11.4s, v0.4s \n" /* out10 = w8 * inr23 */ + "fmla v29.4s, v12.4s, v0.4s \n" /* out11 = w8 * inr24 */ + "fmla v30.4s, v13.4s, v0.4s \n" /* out12 = w8 * inr25 */ + "fmla v31.4s, v14.4s, v0.4s \n" /* out13 = w8 * inr26 */ + "ldp q16, q17, [%[inr3]], #32\n" /* load inr3, 0-1 */ + "fmla v24.4s, v20.4s, v1.4s \n" /* out00 = w9 * inr14 */ + "fmla v25.4s, v21.4s, v1.4s \n" /* out01 = w9 * inr15 */ + "fmla v26.4s, v22.4s, v1.4s \n" /* out02 = w9 * inr16 */ + "fmla v27.4s, v23.4s, v1.4s \n" /* out03 = w9 * inr17 */ + "ldp q18, q19, [%[inr3]], #32\n" /* load inr3, 2-3 */ + "fmla v28.4s, v12.4s, v1.4s \n" /* out10 = w9 * inr24 */ + "fmla v29.4s, v13.4s, v1.4s \n" /* out11 = w9 * inr25 */ + "fmla v30.4s, v14.4s, v1.4s \n" /* out12 = w9 * inr26 */ + "fmla v31.4s, v15.4s, v1.4s \n" /* out13 = w9 * inr27 */ + // out row2 + "fmla v24.4s, v8.4s, v2.4s \n" /* out00 = w10 * inr20 */ + "fmla v25.4s, v9.4s, v2.4s \n" /* out01 = w10 * inr21 */ + "fmla v26.4s, v10.4s, v2.4s \n" /* out02 = w10 * inr22 */ + "fmla v27.4s, v11.4s, v2.4s \n" /* out03 = w10 * inr23 */ + "ldp q4, q5, [%[wc]], #32 \n" /* load w12-w13 */ + "fmla v28.4s, v16.4s, v2.4s \n" /* out10 = w10 * inr30 */ + "fmla v29.4s, v17.4s, v2.4s \n" /* out11 = w10 * inr31 */ + "fmla v30.4s, v18.4s, v2.4s \n" /* out12 = w10 * inr32 */ + "fmla v31.4s, v19.4s, v2.4s \n" /* out13 = w10 * inr33 */ + "ldp q20, q21, [%[inr3]] \n" /* load inr3, 4-5 */ + "fmla v24.4s, v9.4s, v3.4s \n" /* out00 = w11 * inr21 */ + "fmla v25.4s, v10.4s, v3.4s \n" /* out01 = w11 * inr22 */ + "fmla v26.4s, v11.4s, v3.4s \n" /* out02 = w11 * inr23 */ + "fmla v27.4s, v12.4s, v3.4s \n" /* out03 = w11 * inr24 */ + "ldp 
q22, q23, [%[inr3], #32]\n" /* load inr3, 6-7 */ + "fmla v28.4s, v17.4s, v3.4s \n" /* out10 = w11 * inr31 */ + "fmla v29.4s, v18.4s, v3.4s \n" /* out11 = w11 * inr32 */ + "fmla v30.4s, v19.4s, v3.4s \n" /* out12 = w11 * inr33 */ + "fmla v31.4s, v20.4s, v3.4s \n" /* out13 = w11 * inr34 */ + "fmla v24.4s, v10.4s, v4.4s \n" /* out00 = w12 * inr22 */ + "fmla v25.4s, v11.4s, v4.4s \n" /* out01 = w12 * inr23 */ + "fmla v26.4s, v12.4s, v4.4s \n" /* out02 = w12 * inr24 */ + "fmla v27.4s, v13.4s, v4.4s \n" /* out03 = w12 * inr25 */ + "ldp q6, q7, [%[wc]], #32 \n" /* load w14-w15 */ + "fmla v28.4s, v18.4s, v4.4s \n" /* out10 = w12 * inr32 */ + "fmla v29.4s, v19.4s, v4.4s \n" /* out11 = w12 * inr33 */ + "fmla v30.4s, v20.4s, v4.4s \n" /* out12 = w12 * inr34 */ + "fmla v31.4s, v21.4s, v4.4s \n" /* out13 = w12 * inr35 */ + "fmla v24.4s, v11.4s, v5.4s \n" /* out00 = w13 * inr23 */ + "fmla v25.4s, v12.4s, v5.4s \n" /* out01 = w13 * inr24 */ + "fmla v26.4s, v13.4s, v5.4s \n" /* out02 = w13 * inr25 */ + "fmla v27.4s, v14.4s, v5.4s \n" /* out03 = w13 * inr26 */ + "ldp q8, q9, [%[inr4]], #32 \n" /* load inr4, 0-1 */ + "fmla v28.4s, v19.4s, v5.4s \n" /* out10 = w13 * inr33 */ + "fmla v29.4s, v20.4s, v5.4s \n" /* out11 = w13 * inr34 */ + "fmla v30.4s, v21.4s, v5.4s \n" /* out12 = w13 * inr35 */ + "fmla v31.4s, v22.4s, v5.4s \n" /* out13 = w13 * inr36 */ + "fmla v24.4s, v12.4s, v6.4s \n" /* out00 = w14 * inr24 */ + "fmla v25.4s, v13.4s, v6.4s \n" /* out01 = w14 * inr25 */ + "fmla v26.4s, v14.4s, v6.4s \n" /* out02 = w14 * inr26 */ + "fmla v27.4s, v15.4s, v6.4s \n" /* out03 = w14 * inr27 */ + "ldp q10, q11, [%[inr4]], #32\n" /* load inr4, 2-3 */ + "fmla v28.4s, v20.4s, v6.4s \n" /* out10 = w14 * inr34 */ + "fmla v29.4s, v21.4s, v6.4s \n" /* out11 = w14 * inr35 */ + "fmla v30.4s, v22.4s, v6.4s \n" /* out12 = w14 * inr36 */ + "fmla v31.4s, v23.4s, v6.4s \n" /* out13 = w14 * inr37 */ + "ldp q0, q1, [%[wc]], #32 \n" /* load w16-w17 */ + // out row3 + "fmla v24.4s, v16.4s, v7.4s \n" /* out00 = w15 * inr30 */ + "fmla v25.4s, v17.4s, v7.4s \n" /* out01 = w15 * inr31 */ + "fmla v26.4s, v18.4s, v7.4s \n" /* out02 = w15 * inr32 */ + "fmla v27.4s, v19.4s, v7.4s \n" /* out03 = w15 * inr33 */ + "ldp q12, q13, [%[inr4]] \n" /* load inr4, 4-5 */ + "fmla v28.4s, v8.4s, v7.4s \n" /* out10 = w15 * inr40 */ + "fmla v29.4s, v9.4s, v7.4s \n" /* out11 = w15 * inr41 */ + "fmla v30.4s, v10.4s, v7.4s \n" /* out12 = w15 * inr42 */ + "fmla v31.4s, v11.4s, v7.4s \n" /* out13 = w15 * inr42 */ + "ldp q2, q3, [%[wc]], #32 \n" /* load w18-w19 */ + "fmla v24.4s, v17.4s, v0.4s \n" /* out00 = w16 * inr31 */ + "fmla v25.4s, v18.4s, v0.4s \n" /* out01 = w16 * inr32 */ + "fmla v26.4s, v19.4s, v0.4s \n" /* out02 = w16 * inr33 */ + "fmla v27.4s, v20.4s, v0.4s \n" /* out03 = w16 * inr34 */ + "ldp q14, q15, [%[inr4], #32]\n" /* load inr4, 6-7 */ + "fmla v28.4s, v9.4s, v0.4s \n" /* out10 = w16 * inr41 */ + "fmla v29.4s, v10.4s, v0.4s \n" /* out11 = w16 * inr42 */ + "fmla v30.4s, v11.4s, v0.4s \n" /* out12 = w16 * inr43 */ + "fmla v31.4s, v12.4s, v0.4s \n" /* out13 = w16 * inr44 */ + "fmla v24.4s, v18.4s, v1.4s \n" /* out00 = w17 * inr32 */ + "fmla v25.4s, v19.4s, v1.4s \n" /* out01 = w17 * inr33 */ + "fmla v26.4s, v20.4s, v1.4s \n" /* out02 = w17 * inr34 */ + "fmla v27.4s, v21.4s, v1.4s \n" /* out03 = w17 * inr35 */ + "ldp q4, q5, [%[wc]], #32 \n" /* load w20-w21 */ + "fmla v28.4s, v10.4s, v1.4s \n" /* out10 = w17 * inr42 */ + "fmla v29.4s, v11.4s, v1.4s \n" /* out11 = w17 * inr43 */ + "fmla v30.4s, v12.4s, v1.4s \n" /* out12 = w17 * inr44 */ + "fmla 
v31.4s, v13.4s, v1.4s \n" /* out13 = w17 * inr45 */ + "fmla v24.4s, v19.4s, v2.4s \n" /* out00 = w18 * inr33 */ + "fmla v25.4s, v20.4s, v2.4s \n" /* out01 = w18 * inr34 */ + "fmla v26.4s, v21.4s, v2.4s \n" /* out02 = w18 * inr35 */ + "fmla v27.4s, v22.4s, v2.4s \n" /* out03 = w18 * inr36 */ + "ldp q16, q17, [%[inr5]], #32\n" /* load inr5, 0-1 */ + "fmla v28.4s, v11.4s, v2.4s \n" /* out10 = w18 * inr43 */ + "fmla v29.4s, v12.4s, v2.4s \n" /* out11 = w18 * inr44 */ + "fmla v30.4s, v13.4s, v2.4s \n" /* out12 = w18 * inr45 */ + "fmla v31.4s, v14.4s, v2.4s \n" /* out13 = w18 * inr46 */ + "fmla v24.4s, v20.4s, v3.4s \n" /* out00 = w19 * inr34 */ + "fmla v25.4s, v21.4s, v3.4s \n" /* out01 = w19 * inr35 */ + "fmla v26.4s, v22.4s, v3.4s \n" /* out02 = w19 * inr36 */ + "fmla v27.4s, v23.4s, v3.4s \n" /* out03 = w19 * inr37 */ + "ldp q18, q19, [%[inr5]], #32\n" /* load inr5, 2-3 */ + "fmla v28.4s, v12.4s, v3.4s \n" /* out10 = w19 * inr44 */ + "fmla v29.4s, v13.4s, v3.4s \n" /* out11 = w19 * inr45 */ + "fmla v30.4s, v14.4s, v3.4s \n" /* out12 = w19 * inr46 */ + "fmla v31.4s, v15.4s, v3.4s \n" /* out13 = w19 * inr47 */ + // out row4 + "fmla v24.4s, v8.4s, v4.4s \n" /* out00 = w20 * inr40 */ + "fmla v25.4s, v9.4s, v4.4s \n" /* out01 = w20 * inr41 */ + "fmla v26.4s, v10.4s, v4.4s \n" /* out02 = w20 * inr42 */ + "fmla v27.4s, v11.4s, v4.4s \n" /* out03 = w20 * inr43 */ + "ldp q20, q21, [%[inr5]] \n" /* load inr5, 4-5 */ + "fmla v28.4s, v16.4s, v4.4s \n" /* out10 = w20 * inr50 */ + "fmla v29.4s, v17.4s, v4.4s \n" /* out11 = w20 * inr51 */ + "fmla v30.4s, v18.4s, v4.4s \n" /* out12 = w20 * inr52 */ + "fmla v31.4s, v19.4s, v4.4s \n" /* out13 = w20 * inr53 */ + "ldp q6, q7, [%[wc]], #32 \n" /* load w22-w23 */ + "fmla v24.4s, v9.4s, v5.4s \n" /* out00 = w21 * inr41 */ + "fmla v25.4s, v10.4s, v5.4s \n" /* out01 = w21 * inr42 */ + "fmla v26.4s, v11.4s, v5.4s \n" /* out02 = w21 * inr43 */ + "fmla v27.4s, v12.4s, v5.4s \n" /* out03 = w21 * inr44 */ + "ldp q22, q23, [%[inr5], #32]\n" /* load inr5, 6-7 */ + "fmla v28.4s, v17.4s, v5.4s \n" /* out10 = w21 * inr51 */ + "fmla v29.4s, v18.4s, v5.4s \n" /* out11 = w21 * inr52 */ + "fmla v30.4s, v19.4s, v5.4s \n" /* out12 = w21 * inr53 */ + "fmla v31.4s, v20.4s, v5.4s \n" /* out13 = w21 * inr54 */ + "ldp q8, q9, [%[inr0]], #32 \n" /* load inr0, 0-1 */ + "fmla v24.4s, v10.4s, v6.4s \n" /* out00 = w22 * inr42 */ + "fmla v25.4s, v11.4s, v6.4s \n" /* out01 = w22 * inr43 */ + "fmla v26.4s, v12.4s, v6.4s \n" /* out02 = w22 * inr44 */ + "fmla v27.4s, v13.4s, v6.4s \n" /* out03 = w22 * inr45 */ + "ldp q4, q5, [%[wc]], #-384 \n" /* load w24 */ + "fmla v28.4s, v18.4s, v6.4s \n" /* out10 = w22 * inr52 */ + "fmla v29.4s, v19.4s, v6.4s \n" /* out11 = w22 * inr53 */ + "fmla v30.4s, v20.4s, v6.4s \n" /* out12 = w22 * inr54 */ + "fmla v31.4s, v21.4s, v6.4s \n" /* out13 = w22 * inr55 */ + "ldp q0, q1, [%[wc]], #32 \n" /* load w0-w1 */ + "fmla v24.4s, v11.4s, v7.4s \n" /* out00 = w23 * inr43 */ + "fmla v25.4s, v12.4s, v7.4s \n" /* out01 = w23 * inr44 */ + "fmla v26.4s, v13.4s, v7.4s \n" /* out02 = w23 * inr45 */ + "fmla v27.4s, v14.4s, v7.4s \n" /* out03 = w23 * inr46 */ + "ldp q2, q3, [%[wc]], #32 \n" /* load w1-w2 */ + "fmla v28.4s, v19.4s, v7.4s \n" /* out10 = w23 * inr53 */ + "fmla v29.4s, v20.4s, v7.4s \n" /* out11 = w23 * inr54 */ + "fmla v30.4s, v21.4s, v7.4s \n" /* out12 = w23 * inr55 */ + "fmla v31.4s, v22.4s, v7.4s \n" /* out13 = w23 * inr56 */ + "ldp q10, q11, [%[inr0]], #32\n" /* load inr0, 2-3 */ + "fmla v24.4s, v12.4s, v4.4s \n" /* out00 = w24 * inr44 */ + "fmla v25.4s, 
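The `ldp q4, q5, [%[wc]], #-384` above is the weight-pointer reset for the next loop trip: the 25 per-channel weight vectors occupy 25 * 16 = 400 bytes, and when w24 is read the pointer sits 24 vectors (384 bytes) past w0, so the negative post-index rewinds it to w0; the two 32-byte loads that follow then re-establish the pre-loop state (w0..w3 in registers, pointer at w4). The arithmetic, as a sketch:

    // Sketch of the rewind arithmetic used by "ldp ..., #-384".
    constexpr int kVecBytes = 16;    // one q register
    constexpr int kWeightVecs = 25;  // 5x5 kernel, c/4 packed
    constexpr int kRewindBytes = (kWeightVecs - 1) * kVecBytes;
    static_assert(kRewindBytes == 384, "matches the post-index in the asm");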
v13.4s, v4.4s \n" /* out01 = w24 * inr45 */ + "fmla v26.4s, v14.4s, v4.4s \n" /* out02 = w24 * inr46 */ + "fmla v27.4s, v15.4s, v4.4s \n" /* out03 = w24 * inr47 */ + "stp q24, q25, [%[out0]], #32\n" /* store outr0, 0-1 */ + "fmla v28.4s, v20.4s, v4.4s \n" /* out10 = w24 * inr54 */ + "fmla v29.4s, v21.4s, v4.4s \n" /* out11 = w24 * inr55 */ + "stp q26, q27, [%[out0]], #32\n" /* store outr0, 2-3 */ + "fmla v30.4s, v22.4s, v4.4s \n" /* out12 = w24 * inr56 */ + "fmla v31.4s, v23.4s, v4.4s \n" /* out13 = w24 * inr57 */ + "ldr q24, [%[bias]] \n" /* load bias to out00 */ + "subs %w[cnt], %w[cnt], #1\n" /* cnt = cnt - 1 */ + "stp q28, q29, [%[out1]], #32\n" /* store outr1, 0-1 */ + "stp q30, q31, [%[out1]], #32\n" /* store outr1, 2-3 */ + "bne 1b\n" + : [cnt] "+r"(cnt), + [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [inr5] "+r"(inr5), + [wc] "+r"(wptr), + [out0] "+r"(ptr_out0), + [out1] "+r"(ptr_out1) + : [bias] "r"(bias_local) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13", + "v14","v15","v16","v17","v18","v19", + "v20","v21","v22","v23","v24","v25", + "v26","v27","v28","v29","v30","v31" + ); + // clang-format on + block_inr0 = block_inr2; + block_inr1 = block_inr3; + block_inr2 = block_inr4; + block_inr3 = block_inr5; + block_inr4 = block_inr3 + in_len; + block_inr5 = block_inr4 + in_len; + } + write_to_output_c4_fp32(pre_out, + dout_batch, + c, + c + hout_c_block, + h, + h + h_kernel, + 0, + wout_round, + chout, + hout, + wout, + flag_relu, + ptr_write, + &act_param); } } } - free(din_new); } -#endif // __aarch64__ - -void conv_depthwise_5x5s1_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, +#else // __aarch64__ +void conv_depthwise_5x5s1_fp32(float* dout, + const float* din, const float* weights, const float* bias, - int pad, bool flag_bias, bool flag_relu, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int padw, + int padh, + const operators::ConvParam& param, ARMContext* ctx) { - if (win < 4) { - if (flag_relu) { - conv_depthwise_5x5s1_small_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_small_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s1_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_impl(din, - dout, - num, + const int threads = ctx->threads(); + int llc_size = ctx->llc_size() / 4; + auto act_param = param.activation_param; + const int hout_c_block = 4; + const int hout_r_kernel = 1; + const int wout_block = 4; + const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block; + const int win_round = wout_round + 4; + + //! get h block + //! llc_size = threads * win_round * hout_c_block * hin_r_block * + //! sizeof(float) + //! + wout_round * hout_c_block * hout_r_block * threads * sizeof(float) + //! win_round = wout_round + 4 + //! hin_r_block = hout_r_block + 4 + int hout_r_block = (llc_size - 16 * win_round * hout_c_block * threads) / + (win_round * hout_c_block * threads * 4 + + hout_c_block * wout_round * threads * 4); + hout_r_block = hout_r_block > hout ? 
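The `hout_r_block` expression that starts here solves the cache-budget comment above it for the output-row block height H: with T threads, a channel block of C = 4, packed input width Wr = win_round and output width Wo = wout_round, the budget is llc = 4*T*C*Wr*(H + 4) + 4*T*C*Wo*H. A sketch of the closed form, clamped the same way the source clamps it:

    #include <algorithm>

    // Sketch: solve llc >= 4*T*C*Wr*(H+4) + 4*T*C*Wo*H for H (C == 4 here).
    inline int HoutRowBlock(int llc_size, int threads, int win_round,
                            int wout_round, int hout) {
      const int c = 4;
      int h = (llc_size - 16 * win_round * c * threads) /
              (4 * threads * c * (win_round + wout_round));
      return std::min(std::max(h, 1), hout);  // at least one row, at most hout
    }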
hout : hout_r_block; + hout_r_block = + ((hout_r_block + hout_r_kernel - 1) / hout_r_kernel) * hout_r_kernel; + hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; + + const int hin_r_block = hout_r_block + 4; + + float* tmp_work_space = ctx->workspace_data(); + float ptr_zero[win_round]; // NOLINT + memset(ptr_zero, 0, sizeof(float) * win_round); + float ptr_write[wout_round]; // NOLINT + + int in_len = win_round * hout_c_block; + int pre_in_size = hin_r_block * in_len; + pre_in_size = ROUNDUP(pre_in_size, 4); + int pre_out_size = hout_c_block * hout_r_block * wout_round; + + float* tmp_din = tmp_work_space; + + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + int w_stride = 25; // kernel_w * kernel_h; + + int ws = -padw; + int we = ws + win_round; + int w_loop = wout_round / 4; + int chout = chin; + + int out_row_stride = hout_c_block * wout_round; + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * chin * size_in_channel; + float* dout_batch = dout + n * chout * size_out_channel; + for (int h = 0; h < hout; h += hout_r_block) { + int h_kernel = hout_r_block; + if (h + hout_r_block > hout) { + h_kernel = hout - h; + } + int hs = h - padh; + int he = hs + h_kernel + 4; + +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < chout; c += hout_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = + tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size); + float* pre_out = pre_din + pre_in_size; +#else + float* pre_din = tmp_din; + float* pre_out = pre_din + pre_in_size; +#endif + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, chin, win, hin, ptr_zero); + const float* block_inr0 = pre_din; + const float* block_inr1 = block_inr0 + in_len; + const float* block_inr2 = block_inr1 + in_len; + const float* block_inr3 = block_inr2 + in_len; + const float* block_inr4 = block_inr3 + in_len; + + const float* weight_c = weights + c * w_stride; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } + for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { + int cnt = w_loop; + const float* inr0 = block_inr0; + const float* inr1 = block_inr1; + const float* inr2 = block_inr2; + const float* inr3 = block_inr3; + const float* inr4 = block_inr4; + + float* ptr_out0 = pre_out + hk * out_row_stride; + // clang-format off + auto wptr = weight_c; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" /* load bias to out00 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w0-w1 */ + "vld1.32 {d4-d7}, [%[wc]]! \n" /* load w2-w3 */ + "vld1.32 {d8-d11}, [%[inr0]]! \n" /* load inr0, 0-1 */ + "vld1.32 {d12-d15}, [%[inr0]]! \n" /* load inr0, 2-3 */ + "1:\n" + "vld1.32 {d16-d19}, [%[inr0]]! \n" /* load inr0, 4-5 */ + "vmov.u32 q13, q12 \n" /* mov bias to out01 */ + "vmov.u32 q14, q12 \n" /* mov bias to out02 */ + "vmov.u32 q15, q12 \n" /* mov bias to out03 */ + // out row0 + "vmla.f32 q12, q4, q0 \n" /* out00 = w0 * inr00 */ + "vmla.f32 q13, q5, q0 \n" /* out01 = w0 * inr01 */ + "vmla.f32 q14, q6, q0 \n" /* out02 = w0 * inr02 */ + "vmla.f32 q15, q7, q0 \n" /* out03 = w0 * inr03 */ + "vld1.32 {d20-d23}, [%[inr0]]! 
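Inside the channel loop above, every OpenMP thread carves a private [packed input | packed output] slice out of the shared context workspace, so the scratch buffers need no synchronization. A sketch of the slicing, guarded the way the source's ARM_WITH_OMP branch is:

    #ifdef _OPENMP
    #include <omp.h>
    #endif

    // Sketch: per-thread scratch = [pre_in_size floats | pre_out_size floats].
    inline float* ThreadScratch(float* base, int pre_in_size, int pre_out_size) {
    #ifdef _OPENMP
      const int tid = omp_get_thread_num();
    #else
      const int tid = 0;
    #endif
      return base + tid * (pre_in_size + pre_out_size);
    }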
\n" /* load inr0, 6-7 */ + "sub %[inr0], %[inr0], #64 \n" /* inr0 -= 64 */ + "vmla.f32 q12, q5, q1 \n" /* out00 = w1 * inr01 */ + "vmla.f32 q13, q6, q1 \n" /* out01 = w1 * inr02 */ + "vmla.f32 q14, q7, q1 \n" /* out02 = w1 * inr03 */ + "vmla.f32 q15, q8, q1 \n" /* out03 = w1 * inr04 */ + "vld1.32 {d8-d11}, [%[inr1]]!\n" /* load inr1, 0-1 */ + "vmla.f32 q12, q6, q2 \n" /* out00 = w2 * inr02 */ + "vmla.f32 q13, q7, q2 \n" /* out01 = w2 * inr03 */ + "vmla.f32 q14, q8, q2 \n" /* out02 = w2 * inr04 */ + "vmla.f32 q15, q9, q2 \n" /* out03 = w2 * inr05 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w4-w5 */ + "vmla.f32 q12, q7, q3 \n" /* out00 = w3 * inr03 */ + "vmla.f32 q13, q8, q3 \n" /* out01 = w3 * inr04 */ + "vmla.f32 q14, q9, q3 \n" /* out02 = w3 * inr05 */ + "vmla.f32 q15, q10, q3 \n" /* out03 = w3 * inr06 */ + "vld1.32 {d12-d15}, [%[inr1]]!\n" /* load inr1, 2-3 */ + "vmla.f32 q12, q8, q0 \n" /* out00 = w4 * inr04 */ + "vmla.f32 q13, q9, q0 \n" /* out01 = w4 * inr05 */ + "vmla.f32 q14, q10, q0 \n" /* out02 = w4 * inr06 */ + "vmla.f32 q15, q11, q0 \n" /* out03 = w4 * inr07 */ + "vld1.32 {d4-d7}, [%[wc]]! \n" /* load w6-w7 */ + // out row1 + "vmla.f32 q12, q4, q1 \n" /* out00 = w5 * inr10 */ + "vmla.f32 q13, q5, q1 \n" /* out01 = w5 * inr11 */ + "vmla.f32 q14, q6, q1 \n" /* out02 = w5 * inr12 */ + "vmla.f32 q15, q7, q1 \n" /* out03 = w5 * inr13 */ + "vld1.32 {d16-d19}, [%[inr1]]!\n" /* load inr1, 4-5 */ + "vmla.f32 q12, q5, q2 \n" /* out00 = w6 * inr11 */ + "vmla.f32 q13, q6, q2 \n" /* out01 = w6 * inr12 */ + "vmla.f32 q14, q7, q2 \n" /* out02 = w6 * inr13 */ + "vmla.f32 q15, q8, q2 \n" /* out03 = w6 * inr14 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w8-w9 */ + "vmla.f32 q12, q6, q3 \n" /* out00 = w7 * inr12 */ + "vmla.f32 q13, q7, q3 \n" /* out01 = w7 * inr13 */ + "vld1.32 {d20-d23}, [%[inr1]]!\n" /* load inr1, 6-7 */ + "vmla.f32 q14, q8, q3 \n" /* out02 = w7 * inr14 */ + "vmla.f32 q15, q9, q3 \n" /* out03 = w7 * inr15 */ + "sub %[inr1], %[inr1], #64 \n" /* inr1 -= 64 */ + "vmla.f32 q12, q7, q0 \n" /* out00 = w8 * inr13 */ + "vmla.f32 q13, q8, q0 \n" /* out01 = w8 * inr14 */ + "vld1.32 {d8-d11}, [%[inr2]]!\n" /* load inr2, 0-1 */ + "vmla.f32 q14, q9, q0 \n" /* out02 = w8 * inr15 */ + "vmla.f32 q15, q10, q0 \n" /* out03 = w8 * inr16 */ + "vld1.32 {d4-d7}, [%[wc]]! \n" /* load w10-w11 */ + "vmla.f32 q12, q8, q1 \n" /* out00 = w9 * inr14 */ + "vmla.f32 q13, q9, q1 \n" /* out01 = w9 * inr15 */ + "vld1.32 {d12-d15}, [%[inr2]]!\n" /* load inr2, 2-3 */ + "vmla.f32 q14, q10, q1 \n" /* out02 = w9 * inr16 */ + "vmla.f32 q15, q11, q1 \n" /* out03 = w9 * inr17 */ + // out row3 + "vmla.f32 q12, q4, q2 \n" /* out00 = w10 * inr20 */ + "vmla.f32 q13, q5, q2 \n" /* out01 = w10 * inr21 */ + "vld1.32 {d16-d19}, [%[inr2]]!\n" /* load inr2, 4-5 */ + "vmla.f32 q14, q6, q2 \n" /* out02 = w10 * inr22 */ + "vmla.f32 q15, q7, q2 \n" /* out03 = w10 * inr23 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w12-w13 */ + "vmla.f32 q12, q5, q3 \n" /* out00 = w11 * inr21 */ + "vmla.f32 q13, q6, q3 \n" /* out01 = w11 * inr22 */ + "vld1.32 {d20-d23}, [%[inr2]]!\n" /* load inr2, 6-7 */ + "vmla.f32 q14, q7, q3 \n" /* out02 = w11 * inr23 */ + "vmla.f32 q15, q8, q3 \n" /* out03 = w11 * inr24 */ + "vld1.32 {d4-d7}, [%[wc]]! 
\n" /* load w14-w15 */ + "sub %[inr2], %[inr2], #64 \n" /* inr2 -= 64 */ + "vmla.f32 q12, q6, q0 \n" /* out00 = w12 * inr22 */ + "vmla.f32 q13, q7, q0 \n" /* out01 = w12 * inr23 */ + "vmla.f32 q14, q8, q0 \n" /* out02 = w12 * inr24 */ + "vmla.f32 q15, q9, q0 \n" /* out03 = w12 * inr25 */ + "vld1.32 {d8-d11}, [%[inr3]]!\n" /* load inr3, 0-1 */ + "vmla.f32 q12, q7, q1 \n" /* out00 = w13 * inr23 */ + "vmla.f32 q13, q8, q1 \n" /* out01 = w13 * inr24 */ + "vmla.f32 q14, q9, q1 \n" /* out02 = w13 * inr25 */ + "vmla.f32 q15, q10, q1 \n" /* out03 = w13 * inr26 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w16-w17 */ + "vmla.f32 q12, q8, q2 \n" /* out00 = w14 * inr24 */ + "vmla.f32 q13, q9, q2 \n" /* out01 = w14 * inr25 */ + "vld1.32 {d12-d15}, [%[inr3]]!\n" /* load inr3, 2-3 */ + "vmla.f32 q14, q10, q2 \n" /* out02 = w14 * inr26 */ + "vmla.f32 q15, q11, q2 \n" /* out03 = w14 * inr27 */ + // out row3 + "vmla.f32 q12, q4, q3 \n" /* out00 = w15 * inr30 */ + "vmla.f32 q13, q5, q3 \n" /* out01 = w15 * inr31 */ + "vld1.32 {d16-d19}, [%[inr3]]!\n" /* load inr3, 4-5 */ + "vmla.f32 q14, q6, q3 \n" /* out02 = w15 * inr32 */ + "vmla.f32 q15, q7, q3 \n" /* out03 = w15 * inr33 */ + "vld1.32 {d4-d7}, [%[wc]]! \n" /* load w18-w19 */ + "vmla.f32 q12, q5, q0 \n" /* out00 = w16 * inr31 */ + "vmla.f32 q13, q6, q0 \n" /* out01 = w16 * inr32 */ + "vld1.32 {d20-d23}, [%[inr3]]!\n" /* load inr3, 6-7 */ + "vmla.f32 q14, q7, q0 \n" /* out02 = w16 * inr33 */ + "vmla.f32 q15, q8, q0 \n" /* out03 = w16 * inr34 */ + "sub %[inr3], %[inr3], #64 \n" /* inr3 -= 64 */ + "vmla.f32 q12, q6, q1 \n" /* out00 = w17 * inr32 */ + "vmla.f32 q13, q7, q1 \n" /* out01 = w17 * inr33 */ + "vmla.f32 q14, q8, q1 \n" /* out02 = w17 * inr34 */ + "vmla.f32 q15, q9, q1 \n" /* out03 = w17 * inr35 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w20-w21 */ + "vmla.f32 q12, q7, q2 \n" /* out00 = w18 * inr33 */ + "vmla.f32 q13, q8, q2 \n" /* out01 = w18 * inr34 */ + "vmla.f32 q14, q9, q2 \n" /* out02 = w18 * inr35 */ + "vmla.f32 q15, q10, q2 \n" /* out03 = w18 * inr36 */ + "vld1.32 {d8-d11}, [%[inr4]]!\n" /* load inr4, 0-1 */ + "vmla.f32 q12, q8, q3 \n" /* out00 = w19 * inr34 */ + "vmla.f32 q13, q9, q3 \n" /* out01 = w19 * inr35 */ + "vld1.32 {d12-d15}, [%[inr4]]!\n" /* load inr4, 2-3 */ + "vmla.f32 q14, q10, q3 \n" /* out02 = w19 * inr36 */ + "vmla.f32 q15, q11, q3 \n" /* out03 = w19 * inr37 */ + // out row4 + "vmla.f32 q12, q4, q0 \n" /* out00 = w20 * inr40 */ + "vmla.f32 q13, q5, q0 \n" /* out01 = w20 * inr41 */ + "vld1.32 {d16-d19}, [%[inr4]]!\n" /* load inr4, 4-5 */ + "vmla.f32 q14, q6, q0 \n" /* out02 = w20 * inr42 */ + "vmla.f32 q15, q7, q0 \n" /* out03 = w20 * inr43 */ + "vld1.32 {d4-d7}, [%[wc]]! 
\n" /* load w22-w23 */ + "vmla.f32 q12, q5, q1 \n" /* out00 = w21 * inr41 */ + "vmla.f32 q13, q6, q1 \n" /* out01 = w21 * inr42 */ + "vmla.f32 q14, q7, q1 \n" /* out02 = w21 * inr43 */ + "vmla.f32 q15, q8, q1 \n" /* out03 = w21 * inr44 */ + "vld1.32 {d20-d23}, [%[inr4]]!\n" /* load inr4, 6-7 */ + "vmla.f32 q12, q6, q2 \n" /* out00 = w22 * inr42 */ + "vmla.f32 q13, q7, q2 \n" /* out01 = w22 * inr43 */ + "vmla.f32 q14, q8, q2 \n" /* out02 = w22 * inr44 */ + "vmla.f32 q15, q9, q2 \n" /* out03 = w22 * inr45 */ + "vld1.32 {d4-d5}, [%[wc]] \n" /* load w24 */ + "sub %[inr4], %[inr4], #64 \n" /* inr4 -= 64 */ + "vmla.f32 q12, q7, q3 \n" /* out00 = w23 * inr43 */ + "vmla.f32 q13, q8, q3 \n" /* out01 = w23 * inr44 */ + "vld1.32 {d8-d11}, [%[inr0]]!\n" /* load inr0, 0-1 */ + "sub %[wc], %[wc], #384 \n" /* wptr = wptr - 384 */ + "vmla.f32 q14, q9, q3 \n" /* out02 = w23 * inr45 */ + "vmla.f32 q15, q10, q3 \n" /* out03 = w23 * inr46 */ + "vld1.32 {d0-d3}, [%[wc]]! \n" /* load w0-w1 */ + "vmla.f32 q12, q8, q2 \n" /* out00 = w24 * inr44 */ + "vmla.f32 q13, q9, q2 \n" /* out01 = w24 * inr45 */ + "vld1.32 {d12-d15}, [%[inr0]]!\n" /* load inr0, 2-3 */ + "vmla.f32 q14, q10, q2 \n" /* out02 = w24 * inr46 */ + "vmla.f32 q15, q11, q2 \n" /* out03 = w24 * inr47 */ + "vst1.32 {d24-d27}, [%[out0]]!\n" /* store out00, out01 */ + "vld1.32 {d4-d7}, [%[wc]]! \n" /* load w2-w3 */ + "subs %[cnt], %[cnt], #1 \n" /* cnt = cnt - 1 */ + "vst1.32 {d28-d31}, [%[out0]]!\n" /* store out02, out03 */ + "vld1.32 {d24-d25}, [%[bias]] \n" /* load bias to out00 */ + "bne 1b\n" + : [cnt] "+r"(cnt), + [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [wc] "+r"(wptr), + [out0] "+r"(ptr_out0) + : [bias] "r"(bias_local) + : "cc","memory", + "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); + // clang-format on + block_inr0 = block_inr1; + block_inr1 = block_inr2; + block_inr2 = block_inr3; + block_inr3 = block_inr4; + block_inr4 = block_inr3 + in_len; + } + write_to_output_c4_fp32(pre_out, + dout_batch, + c, + c + hout_c_block, + h, + h + h_kernel, + 0, + wout_round, chout, hout, wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, flag_relu, - ctx); + ptr_write, + &act_param); + } } } } - +#endif // __aarch64__ } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc b/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc index 802082048c86beeeecfe64a0de09880b1b9b0137..ed3dad300804dc90fac874999ac5d0a420cff4a4 100644 --- a/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc +++ b/lite/backends/arm/math/conv5x5s1_depthwise_int8.cc @@ -709,7 +709,6 @@ void conv_depthwise_5x5s1_int8(Dtype* dout, "q15"); #endif // clang-format on - int32_t* ptr_tmp = ptr_out0 - w_loop * 32; block_inr0 = block_inr1; block_inr1 = block_inr2; block_inr2 = block_inr3; diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc index dced24db72f71630c0cb9d7ff4275f740a2b69a4..a72b7553e0c8fddcb9028b0e6125281a07e65387 100644 --- a/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc @@ -13,3732 +13,932 @@ // limitations under the License. 
#include +#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_depthwise.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif namespace paddle { namespace lite { namespace arm { namespace math { +#ifdef __aarch64__ +#define COMPUTE \ + "ldp q0, q1, [%[inr0]], #32\n" /* load r0, 0-1 */ \ + "and v19.16b, %[vbias].16b, %[vbias].16b\n" \ + "ldp q2, q3, [%[inr0]], #32\n" /* load r0, 2-3 */ \ + "and v20.16b, %[vbias].16b, %[vbias].16b\n" \ + "ldp q4, q5, [%[inr0]], #32\n" /* load r0, 4-5 */ \ + "and v21.16b, %[vbias].16b, %[vbias].16b\n" \ + "ldp q6, q7, [%[inr0]], #32\n" /* load r0, 6-7 */ \ + "and v22.16b, %[vbias].16b, %[vbias].16b\n" \ + "ldp q8, q9, [%[inr0]], #32\n" /* load r0, 8-9 */ \ + "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \ + "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \ + "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \ + "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \ + "ldr q10, [%[inr0]] \n" /* load r0, 10 */ \ + "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \ + "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \ + "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \ + "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \ + "sub %[inr0], %[inr0], #32\n" /* inr0 -= 32 */ \ + "ldp q0, q1, [%[inr1]], #32\n" /* load r1, 0-1 */ \ + "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \ + "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \ + "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \ + "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \ + "ldp q14, q15, [%[wc0]], #32\n" /* load w0-1, to q14-15*/ \ + "fmla v19.4s , %[w3].4s, v3.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , %[w3].4s, v5.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , %[w3].4s, v7.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , %[w3].4s, v9.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q16, q17, [%[wc0]], #32\n" /* load w2-3, to q16-17*/ \ + "ldp q2, q3, [%[inr1]], #32\n" /* load r1, 2-3 */ \ + "fmla v19.4s , %[w4].4s, v4.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , %[w4].4s, v6.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , %[w4].4s, v8.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , %[w4].4s, v10.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q4, q5, [%[inr1]], #32\n" /* load r1, 4-5 */ \ + "ldr q18, [%[wc0]], #16\n" /* load w4, to q18*/ \ + "ldp q6, q7, [%[inr1]], #32\n" /* load r0, 6-7 */ \ + "fmla v19.4s , v14.4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \ + "fmla v20.4s , v14.4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \ + "fmla v21.4s , v14.4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \ + "fmla v22.4s , v14.4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \ + "ldp q8, q9, [%[inr1]], #32\n" /* load r0, 8-9 */ \ + "fmla v19.4s , v15.4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \ + "fmla v20.4s , v15.4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \ + "fmla v21.4s , v15.4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \ + "fmla v22.4s , v15.4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \ + "ldr q10, [%[inr1]] \n" /* load r0, 10 */ \ + "fmla v19.4s , v16.4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \ + "fmla v20.4s , v16.4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \ + "fmla v21.4s , v16.4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \ + "fmla v22.4s , v16.4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \ + "sub %[inr1], %[inr1], #32\n" /* inr1 -= 32 */ \ + "ldp q0, q1, [%[inr2]], #32\n" /* load r1, 0-1 */ \ + "ldp q14, q15, [%[wc0]], #32\n" /* load w0-1, to q14-15*/ \ + "fmla v19.4s , v17.4s, v3.4s\n" /* outr0 
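The s2 COMPUTE macro introduced here produces a 1x4 strip of 4-channel outputs per row of taps: at stride 2, output x reads packed vectors 2x .. 2x+4, so the strip touches input positions 0..10 (hence the `ldp` pairs up to q9 plus the final lone `ldr q10`). A minimal intrinsics sketch of one tap row, assuming the same NC4HW4 packing (`vfmaq_f32` again being the a64 form):

    #include <arm_neon.h>

    // Sketch: out[x] += sum_k w[k] * in[2*x + k], k = 0..4, for x = 0..3.
    inline void Row5TapsS2C4(const float* in /* >= 11 packed vectors */,
                             const float32x4_t w[5], float32x4_t out[4]) {
      for (int x = 0; x < 4; ++x) {
        float32x4_t acc = out[x];
        for (int k = 0; k < 5; ++k)
          acc = vfmaq_f32(acc, vld1q_f32(in + 4 * (2 * x + k)), w[k]);
        out[x] = acc;
      }
    }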
= w3 * r1, 0*/ \ + "fmla v20.4s , v17.4s, v5.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v17.4s, v7.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v17.4s, v9.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q16, q17, [%[wc0]], #32\n" /* load w2-3, to q16-17*/ \ + "ldp q2, q3, [%[inr2]], #32\n" /* load r1, 2-3 */ \ + "fmla v19.4s , v18.4s, v4.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v18.4s, v6.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v18.4s, v8.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v18.4s, v10.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q4, q5, [%[inr2]], #32\n" /* load r1, 4-5 */ \ + "ldr q18, [%[wc0]], #16\n" /* load w4, to q18*/ \ + "ldp q6, q7, [%[inr2]], #32\n" /* load r0, 6-7 */ \ + "fmla v19.4s , v14.4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \ + "fmla v20.4s , v14.4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \ + "fmla v21.4s , v14.4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \ + "fmla v22.4s , v14.4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \ + "ldp q8, q9, [%[inr2]], #32\n" /* load r0, 8-9 */ \ + "fmla v19.4s , v15.4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \ + "fmla v20.4s , v15.4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \ + "fmla v21.4s , v15.4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \ + "fmla v22.4s , v15.4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \ + "ldr q10, [%[inr2]] \n" /* load r0, 10 */ \ + "fmla v19.4s , v16.4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \ + "fmla v20.4s , v16.4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \ + "fmla v21.4s , v16.4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \ + "fmla v22.4s , v16.4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \ + "sub %[inr2], %[inr2], #32\n" /* inr0 -= 32 */ \ + "ldp q0, q1, [%[inr3]], #32\n" /* load r1, 0-1 */ \ + "ldp q14, q15, [%[wc0]], #32\n" /* load w0-1, to q14-15*/ \ + "fmla v19.4s , v17.4s, v3.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v17.4s, v5.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v17.4s, v7.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v17.4s, v9.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q16, q17, [%[wc0]], #32\n" /* load w2-3, to q16-17*/ \ + "ldp q2, q3, [%[inr3]], #32\n" /* load r1, 2-3 */ \ + "fmla v19.4s , v18.4s, v4.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v18.4s, v6.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v18.4s, v8.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v18.4s, v10.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q4, q5, [%[inr3]], #32\n" /* load r1, 4-5 */ \ + "ldr q18, [%[wc0]], #16\n" /* load w4, to q18*/ \ + "ldp q6, q7, [%[inr3]], #32\n" /* load r0, 6-7 */ \ + "fmla v19.4s , v14.4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \ + "fmla v20.4s , v14.4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \ + "fmla v21.4s , v14.4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \ + "fmla v22.4s , v14.4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \ + "ldp q8, q9, [%[inr3]], #32\n" /* load r0, 8-9 */ \ + "fmla v19.4s , v15.4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \ + "fmla v20.4s , v15.4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \ + "fmla v21.4s , v15.4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \ + "fmla v22.4s , v15.4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \ + "ldr q10, [%[inr3]] \n" /* load r0, 10 */ \ + "fmla v19.4s , v16.4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \ + "fmla v20.4s , v16.4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \ + "fmla v21.4s , v16.4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \ + "fmla v22.4s , v16.4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \ + "sub %[inr3], %[inr3], #32\n" /* inr0 -= 32 */ \ + "ldp q0, q1, [%[inr4]], #32\n" /* load r1, 0-1 */ \ + "ldp q14, q15, [%[wc0]], #32\n" /* load w0-1, to q14-15*/ \ + "fmla v19.4s , v17.4s, v3.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v17.4s, v5.4s\n" /* outr1 = w3 * 
r1, 2*/ \ + "fmla v21.4s , v17.4s, v7.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v17.4s, v9.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q16, q17, [%[wc0]], #32\n" /* load w2-3, to q16-17*/ \ + "ldp q2, q3, [%[inr4]], #32\n" /* load r1, 2-3 */ \ + "fmla v19.4s , v18.4s, v4.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v18.4s, v6.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v18.4s, v8.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v18.4s, v10.4s\n" /* outr3 = w3 * r1, 6*/ \ + "ldp q4, q5, [%[inr4]], #32\n" /* load r1, 4-5 */ \ + "ldr q18, [%[wc0]], #16\n" /* load w4, to q18*/ \ + "ldp q6, q7, [%[inr4]], #32\n" /* load r0, 6-7 */ \ + "fmla v19.4s , v14.4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \ + "fmla v20.4s , v14.4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \ + "fmla v21.4s , v14.4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \ + "fmla v22.4s , v14.4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \ + "ldp q8, q9, [%[inr4]], #32\n" /* load r0, 8-9 */ \ + "fmla v19.4s , v15.4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \ + "fmla v20.4s , v15.4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \ + "fmla v21.4s , v15.4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \ + "fmla v22.4s , v15.4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \ + "ldr q10, [%[inr4]] \n" /* load r0, 10 */ \ + "fmla v19.4s , v16.4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \ + "fmla v20.4s , v16.4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \ + "fmla v21.4s , v16.4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \ + "fmla v22.4s , v16.4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \ + "sub %[inr4], %[inr4], #32\n" /* inr0 -= 32 */ \ + "fmla v19.4s , v17.4s, v3.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v17.4s, v5.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v17.4s, v7.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v17.4s, v9.4s\n" /* outr3 = w3 * r1, 6*/ \ + "fmla v19.4s , v18.4s, v4.4s\n" /* outr0 = w3 * r1, 0*/ \ + "fmla v20.4s , v18.4s, v6.4s\n" /* outr1 = w3 * r1, 2*/ \ + "fmla v21.4s , v18.4s, v8.4s\n" /* outr2 = w3 * r1, 4*/ \ + "fmla v22.4s , v18.4s, v10.4s\n" /* outr3 = w3 * r1, 6*/ \ + "sub %[wc0], %[wc0], #320\n" /* weight -= 320 */ \ + "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ \ + "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ \ + "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ \ + "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ \ + "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ \ + "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ \ + "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ \ + "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ +#define RELU /* relu */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" +#define RELU6 /* relu6 */ \ + "fmin v19.4s, v19.4s, %[vsix].4s\n" \ + "fmin v20.4s, v20.4s, %[vsix].4s\n" \ + "fmin v21.4s, v21.4s, %[vsix].4s\n" \ + "fmin v22.4s, v22.4s, %[vsix].4s\n" +#define LEAKY_RELU /* LeakyRelu */ \ + "movi v0.4s, #0\n" /* for relu */ \ + "fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */ \ + "fcmge v3.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */ \ + "fcmge v5.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */ \ + "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ + "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \ + "bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \ + "bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \ + "bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v7.16b \n" /* choose*/ +#define 
STORE /* save result */ \ + "str q19, [%[outc0]], #16\n" \ + "str q20, [%[outc1]], #16\n" \ + "str q21, [%[outc2]], #16\n" \ + "str q22, [%[outc3]], #16\n" -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, +#else +#define COMPUTE \ + /* fill with bias */ \ + "vld1.32 {d12-d13}, [%[bias]]\n" /* load bias */ /* load weights */ \ + "vld1.32 {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */ \ + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ \ + "vand.i32 q12, q6, q6\n" \ + "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ \ + "vand.i32 q13, q6, q6\n" \ + "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ \ + "vand.i32 q14, q6, q6\n" \ + "vand.i32 q15, q6, q6\n" \ + "vld1.32 {d12-d13}, [%[r0]]!\n" /* load input r0, 6*/ \ + "vmla.f32 q12, q7, q0 @ w0 * inr0\n" \ + "vmla.f32 q13, q7, q2 @ w0 * inr2\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w2-3, to q9-q10 */ \ + "vmla.f32 q14, q7, q4 @ w0 * inr4\n" \ + "vmla.f32 q15, q7, q6 @ w0 * inr6\n" \ + "vmla.f32 q12, q8, q1 @ w1 * inr1\n" \ + "vmla.f32 q13, q8, q3 @ w1 * inr3\n" \ + "vmla.f32 q14, q8, q5 @ w1 * inr5\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w4, to q11 */ \ + "vmla.f32 q12, q9, q2 @ w2 * inr2\n" \ + "vmla.f32 q13, q9, q4 @ w2 * inr6\n" \ + "vmla.f32 q14, q9, q6 @ w2 * inr4\n" \ + "vld1.32 {d0-d3}, [%[r0]]! \n" /* load r0, 7-8 */ \ + "vmla.f32 q12, q10, q3 @ w3 * inr3\n" \ + "vmla.f32 q13, q10, q5 @ w3 * inr5\n" \ + "vmla.f32 q14, q10, q0 @ w3 * inr7\n" \ + "vmla.f32 q15, q8, q0 @ w1 * inr7\n" \ + "vld1.32 {d4-d7}, [%[r0]] \n" /* load r0, 9-10 */ \ + "vmla.f32 q12, q11, q4 @ w4 * inr4\n" \ + "vmla.f32 q13, q11, q6 @ w4 * inr6\n" \ + "vmla.f32 q14, q11, q1 @ w4 * inr8\n" \ + "vmla.f32 q15, q9, q1 @ w2 * inr8\n" \ + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" \ + "vld1.32 {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */ \ + "vmla.f32 q15, q10, q2 @ w3 * inr9\n" \ + "vld1.32 {d4-d5}, [%[r1]]! @ load r1, 2\n" \ + "sub %[r0], %[r0], #16 @ r0 - 16 to nextline address\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w2-3, to q9-10 */ \ + "vmla.f32 q12, q7, q0 @ w0 * inr0\n" \ + "vmla.f32 q13, q7, q2 @ w0 * inr2\n" \ + "vmla.f32 q15, q11, q3 @ w4 * inr10\n" \ + "vld1.32 {d6-d9}, [%[r1]]! @ load r1, 3, 4\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w4, to q11 */ \ + "vld1.32 {d10-d13}, [%[r1]]! 
@ load r1, 5, 6\n" \ + "vmla.f32 q14, q7, q4 @ w0 * inr0\n" \ + "vmla.f32 q15, q7, q6 @ w0 * inr2\n" \ + "vmla.f32 q12, q8, q1 @ w1 * inr1\n" \ + "vmla.f32 q13, q8, q3 @ w1 * inr3\n" \ + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 7, 8\n" \ + "vmla.f32 q14, q8, q5 @ w1 * inr5\n" \ + "vmla.f32 q15, q8, q0 @ w1 * inr7\n" \ + "vmla.f32 q12, q9, q2 @ w2 * inr2\n" \ + "vmla.f32 q13, q9, q4 @ w2 * inr4\n" \ + "vmla.f32 q14, q9, q6 @ w2 * inr6\n" \ + "vmla.f32 q15, q9, q1 @ w2 * inr8\n" \ + "vmla.f32 q12, q10, q3 @ w3 * inr3\n" \ + "vld1.32 {d4-d7}, [%[r1]] @ load r1, 9, 10\n" \ + "vmla.f32 q13, q10, q5 @ w3 * inr5\n" \ + "vmla.f32 q14, q10, q0 @ w3 * inr7\n" \ + "vmla.f32 q15, q10, q2 @ w3 * inr9\n" \ + "vld1.32 {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */ \ + "vmla.f32 q12, q11, q4 @ w4 * inr4\n" \ + "vmla.f32 q13, q11, q6 @ w4 * inr6\n" \ + "vmla.f32 q14, q11, q1 @ w4 * inr8\n" \ + "vmla.f32 q15, q11, q3 @ w4 * inr10\n" \ + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w2-3, to q9-10 */ \ + "sub %[r1], %[r1], #16 @ r1 - 16 to nextline address\n" \ + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w4 to q11 */ \ + "vmla.f32 q12, q7, q0 @ w0 * inr0\n" \ + "vmla.f32 q13, q7, q2 @ w0 * inr2\n" \ + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" \ + "vmla.f32 q12, q8, q1 @ w1 * inr1\n" \ + "vmla.f32 q13, q8, q3 @ w1 * inr3\n" \ + "vld1.32 {d12-d13}, [%[r2]]! @ load r2, 6 \n" \ + "vmla.f32 q14, q7, q4 @ w0 * inr4\n" \ + "vmla.f32 q15, q7, q6 @ w0 * inr6\n" \ + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 7, 8\n" \ + "vmla.f32 q12, q9, q2 @ w2 * inr2\n" \ + "vmla.f32 q13, q9, q4 @ w2 * inr4\n" \ + "vmla.f32 q14, q8, q5 @ w1 * inr5\n" \ + "vmla.f32 q15, q8, q0 @ w1 * inr7\n" \ + "vmla.f32 q12, q10, q3 @ w3 * inr3\n" \ + "vmla.f32 q13, q10, q5 @ w3 * inr5\n" \ + "vmla.f32 q14, q9, q6 @ w2 * inr6\n" \ + "vmla.f32 q15, q9, q1 @ w2 * inr8\n" \ + "vld1.32 {d4-d7}, [%[r2]] @ load r2, 9, 10\n" \ + "vmla.f32 q12, q11, q4 @ w4 * inr4\n" \ + "vmla.f32 q13, q11, q6 @ w4 * inr6\n" \ + "vmla.f32 q14, q10, q0 @ w3 * inr7\n" \ + "vmla.f32 q15, q10, q2 @ w3 * inr9\n" \ + "vld1.32 {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */ \ + "sub %[r2], %[r2], #16 @ r1 - 16 to nextline address\n" \ + "vmla.f32 q14, q11, q1 @ w4 * inr8\n" \ + "vld1.32 {d0-d3}, [%[r3]]! @ load r3, 0, 1\n" \ + "vmla.f32 q15, q11, q3 @ w4 * inr10\n" \ + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, 2, 3\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w2-3, to q9-10 */ \ + "vmla.f32 q12, q7, q0 @ w0 * inr0\n" \ + "vmla.f32 q13, q7, q2 @ w0 * inr2\n" \ + "vld1.32 {d8-d11}, [%[r3]]! @ load r3, 4, 5\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w4 to q11 */ \ + "vld1.32 {d12-d13}, [%[r3]]! @ load r3, 6, \n" \ + "vmla.f32 q12, q8, q1 @ w1 * inr1\n" \ + "vmla.f32 q13, q8, q3 @ w1 * inr3\n" \ + "vmla.f32 q14, q7, q4 @ w0 * inr4\n" \ + "vmla.f32 q15, q7, q6 @ w0 * inr6\n" \ + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, 7, 8\n" \ + "vmla.f32 q12, q9, q2 @ w2 * inr2\n" \ + "vmla.f32 q13, q9, q4 @ w2 * inr4\n" \ + "vmla.f32 q14, q8, q5 @ w1 * inr5\n" \ + "vmla.f32 q15, q8, q0 @ w1 * inr7\n" \ + "vmla.f32 q12, q10, q3 @ w3 * inr3\n" \ + "vld1.32 {d4-d7}, [%[r3]] @ load r3, 9, 10\n" \ + "vmla.f32 q13, q10, q5 @ w3 * inr5\n" \ + "vmla.f32 q14, q9, q6 @ w2 * inr6\n" \ + "vmla.f32 q15, q9, q1 @ w2 * inr8\n" \ + "vmla.f32 q12, q11, q4 @ w4 * inr4\n" \ + "vmla.f32 q13, q11, q6 @ w4 * inr6\n" \ + "vmla.f32 q14, q10, q0 @ w3 * inr7\n" \ + "vmla.f32 q15, q10, q2 @ w3 * inr9\n" \ + "vld1.32 {d14-d17}, [%[wc0]]!\n" /* load w0-1, to q7-8 */ \ + "sub %[r3], %[r3], #16 @ r1 - 16 to nextline address\n" \ + "vmla.f32 q14, q11, q1 @ w4 * inr8\n" \ + "vld1.32 {d0-d3}, [%[r4]]! @ load r4, 0, 1\n" \ + "vmla.f32 q15, q11, q3 @ w4 * inr10\n" \ + "vld1.32 {d4-d7}, [%[r4]]! @ load r4, 2, 3\n" \ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w2-3, to q9-10 */ \ + "vmla.f32 q12, q7, q0 @ w0 * inr0\n" \ + "vmla.f32 q13, q7, q2 @ w0 * inr2\n" \ + "vld1.32 {d8-d11}, [%[r4]]! @ load r3, 4, 5\n" \ + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w4 to q11 */ \ + "vld1.32 {d12-d13}, [%[r4]]! @ load r3, 6, \n" \ + "vmla.f32 q12, q8, q1 @ w1 * inr1\n" \ + "vmla.f32 q13, q8, q3 @ w1 * inr3\n" \ + "vmla.f32 q14, q7, q4 @ w0 * inr4\n" \ + "vmla.f32 q15, q7, q6 @ w0 * inr6\n" \ + "vld1.32 {d0-d3}, [%[r4]]! @ load r3, 7, 8\n" \ + "vmla.f32 q12, q9, q2 @ w2 * inr2\n" \ + "vmla.f32 q13, q9, q4 @ w2 * inr4\n" \ + "vmla.f32 q14, q8, q5 @ w1 * inr5\n" \ + "vmla.f32 q15, q8, q0 @ w1 * inr7\n" \ + "vmla.f32 q12, q10, q3 @ w3 * inr3\n" \ + "vld1.32 {d4-d7}, [%[r4]] @ load r3, 9, 10\n" \ + "vmla.f32 q13, q10, q5 @ w3 * inr5\n" \ + "vmla.f32 q14, q9, q6 @ w2 * inr6\n" \ + "vmla.f32 q15, q9, q1 @ w2 * inr8\n" \ + "vmla.f32 q12, q11, q4 @ w4 * inr4\n" \ + "vmla.f32 q13, q11, q6 @ w4 * inr6\n" \ + "vmla.f32 q14, q10, q0 @ w3 * inr7\n" \ + "vmla.f32 q15, q10, q2 @ w3 * inr9\n" \ + "sub %[wc0], %[wc0], #400 @ wc0 - 400 to start address\n" \ + "sub %[r4], %[r4], #16 @ r1 - 16 to nextline address\n" \ + "vmla.f32 q14, q11, q1 @ w4 * inr8\n" \ + "vmla.f32 q15, q11, q3 @ w4 * inr10\n" \ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ \ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ \ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ \ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ + +#define RELU /* relu */ \ + "vmov.u32 q0, #0\n" \ + "vld1.32 {d2-d3}, [%[six_ptr]]\n" \ + "vmax.f32 q12, q12, q0\n" \ + "vmax.f32 q13, q13, q0\n" \ + "vmax.f32 q14, q14, q0\n" \ + "vmax.f32 q15, q15, q0\n" +#define RELU6 /* relu6 */ \ + "vmin.f32 q12, q12, q1\n" \ + "vmin.f32 q13, q13, q1\n" \ + "vmin.f32 q14, q14, q1\n" \ + "vmin.f32 q15, q15, q1\n" +#define LEAKY_RELU /* LeakyRelu */ \ + "vmov.u32 q0, #0\n" \ + "vld1.32 {d2-d3}, [%[scale_ptr]]\n" \ + "vcge.f32 q2, q12, q0 @ q0 > 0 \n" \ + "vcge.f32 q4, q13, q0 @ q0 > 0 \n" \ + "vcge.f32 q6, q14, q0 @ q0 > 0 \n" \ + "vcge.f32 q8, q15, q0 @ q0 > 0 \n" \ + "vmul.f32 q3, q12, q1 @ mul \n" \ + "vmul.f32 q5, q13, q1 @ mul \n" \ + "vmul.f32 q7, q14, q1 @ mul \n" \ + "vmul.f32 q9, q15, q1 @ mul \n" \ + "vbif q12, q3, q2 @ choose \n" \ + "vbif q13, q5, q4 @ choose \n" \ + "vbif q14, q7, q6 @ choose \n" \ + "vbif q15, q9, q8 @ choose \n" +#define STORE /* save result */ \ + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ \ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ \ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ \ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + +#endif + +void act_switch_5x5s2(const float* inr0, + const 
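Two things worth spelling out about the macro blocks that end here. First, COMPUTE leaves each accumulator holding one output pixel with the 4 channels in lanes; the trn1/trn2 sequence (vtrn/vswp on armv7) transposes that 4x4 so each register holds four pixels of one channel and stores straight through outc0..outc3. Second, LEAKY_RELU is a compare-multiply-select, and `act_switch_5x5s2` below picks a fused epilogue by concatenating adjacent string literals (e.g. COMPUTE RELU STORE) into one asm body at compile time. Sketches of both in portable intrinsics (`vbslq_f32` keeps its second operand where the mask is set, matching the bif/vbif usage):

    #include <arm_neon.h>

    // Sketch of the accumulator transpose done by trn1/trn2 (vtrn/vswp on armv7).
    inline void Transpose4x4(float32x4_t io[4]) {
      float32x4x2_t t01 = vtrnq_f32(io[0], io[1]);
      float32x4x2_t t23 = vtrnq_f32(io[2], io[3]);
      io[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
      io[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
      io[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
      io[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
    }

    // Sketch of LEAKY_RELU: keep v where v >= 0, else v * scale.
    inline float32x4_t LeakyRelu(float32x4_t v, float32x4_t vscale) {
      uint32x4_t nonneg = vcgeq_f32(v, vdupq_n_f32(0.f));
      return vbslq_f32(nonneg, v, vmulq_f32(v, vscale));
    }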
float* inr1, + const float* inr2, + const float* inr3, + const float* inr4, + float* outc0, + float* outc1, + float* outc2, + float* outc3, + float32x4_t w0, + float32x4_t w1, + float32x4_t w2, + float32x4_t w3, + float32x4_t w4, + float32x4_t vbias, + const float* weight_c, + float* bias_local, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(tmp); + float32x4_t vscale = vdupq_n_f32(ss); +#else + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [vbias] "w"(vbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [r4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [six_ptr] "r"(vsix) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU RELU6 STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [vbias] "w"(vbias), + [vsix] "w"(vsix) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE RELU RELU6 STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [r4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [six_ptr] "r"(vsix) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE LEAKY_RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [vbias] "w"(vbias), + [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + 
"v21", + "v22"); +#else + asm volatile(COMPUTE LEAKY_RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [r4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local), [scale_ptr] "r"(vscale) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [inr4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [vbias] "w"(vbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); +#else + asm volatile(COMPUTE STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [r4] "+r"(inr4), + [wc0] "+r"(weight_c), + [outc0] "+r"(outc0), + [outc1] "+r"(outc1), + [outc2] "+r"(outc2), + [outc3] "+r"(outc3) + : [bias] "r"(bias_local) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } +} +void conv_depthwise_5x5s2_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, int win, const float* weights, const float* bias, - int pad, - bool flag_bias, - bool flag_relu, + const operators::ConvParam& param, + const operators::ActivationParam act_param, ARMContext* ctx) { - if (pad == 2) { - if (win >= 9) { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 3; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 3; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = threads * prein_size + win_round + ow_round; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + 
win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 25; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; } - } - } -} - #ifdef __aarch64__ - -//! larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 2) { - //! 
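// The removed kernels below handle the right edge with a 12-lane bit mask:
// memset(mask, 0xff, ...) sets every lane to all-ones, the last mask_cnt
// lanes are zeroed, and the asm `bif` (bitwise insert if false) blends
// out-of-range input lanes with zero before they reach an fmla. A minimal
// intrinsics sketch of that predication, assuming <arm_neon.h> and 32-bit
// unsigned mask lanes:
//   #include <arm_neon.h>
//   inline float32x4_t masked_load_f32(const float* p, const uint32_t* mask) {
//     float32x4_t v = vld1q_f32(p);    // may pick up lanes past the row end
//     uint32x4_t m = vld1q_u32(mask);  // 0xffffffff keeps a lane, 0 zeroes it
//     return vbslq_f32(m, v, vdupq_n_f32(0.f));  // keep v where mask is set
//   }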
(h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; - case 3: - din4 = zero_ptr; - case 2: - din5 = zero_ptr; - case 1: - din6 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //! out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // 
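// Stride-2 vectorization trick used throughout this asm: `ld2` de-interleaves
// a row into even columns (0 2 4 6) and odd columns (1 3 5 7), so one fmla
// consumes four stride-2 taps at a time. A minimal intrinsics sketch of the
// same split, assuming <arm_neon.h>:
//   #include <arm_neon.h>
//   inline void load_even_odd(const float* row,
//                             float32x4_t* even,
//                             float32x4_t* odd) {
//     float32x4x2_t v = vld2q_f32(row);  // val[0] = row[0,2,4,6],
//                                        // val[1] = row[1,3,5,7]
//     *even = v.val[0];
//     *odd = v.val[1];
//   }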
load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla 
v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - // 6 8, - // v11: 3 5 - // 7 9 - - "fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "cmp %w[mid_cnt], #1 \n" - - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] 
\n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - 
"fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - 
"v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t vbias = vdupq_n_f32(0.f); + if (flag_bias) { + vbias = vld1q_f32(&bias[c]); // v28 } - } - } -} - -//! larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; - -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 2) { - //! 
(h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; + weight_c += 20; +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + const float* inr4 = inr3 + row_len; + + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { case 3: - din4 = zero_ptr; + outc1 = ptr_write; case 2: - din5 = zero_ptr; + outc2 = ptr_write; case 1: - din6 = zero_ptr; + outc3 = ptr_write; default: break; } } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; + auto c0 = outc0; + auto c1 = outc1; + auto c2 = outc2; + auto c3 = outc3; + float pre_out[16]; + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + if (flag_mask) { + c0 = outc0; + c1 = outc1; + c2 = outc2; + c3 = outc3; + outc0 = pre_out; + outc1 = pre_out + 4; + outc2 = pre_out + 8; + outc3 = pre_out + 12; } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //! 
out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // 
out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - "fmax v26.4s, v26.4s, v31.4s \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - 
// 6 8, - // v11: 3 5 - // 7 9 - - "fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - "cmp %w[mid_cnt], #1 \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] 
\n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v26.4s, v26.4s, v31.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], 
%[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "fmax v26.4s, v26.4s, v31.4s \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "fmax v27.4s, v27.4s, v31.4s \n" - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - } - } -} - -//! 
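// The small-width path below (win < 9) emits one output row per pass and
// leans entirely on the 12-entry lane mask: with 2 columns of implicit left
// padding, a row occupies w_in + 2 of the 12 lanes, so the trailing
// 12 - (w_in + 2) lanes are zeroed and every load is blended before use. A
// minimal sketch of the mask setup, assuming 32-bit lanes:
//   inline void build_small_row_mask(int w_in, uint32_t mask[12]) {
//     for (int i = 0; i < 12; ++i) mask[i] = 0xffffffffu;  // keep by default
//     const int mask_cnt = 12 - w_in - 2;               // lanes past the row
//     for (int i = 0; i < mask_cnt; ++i) mask[11 - i] = 0u;  // zero the tail
//   }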
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
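// The _relu_s variant below is identical to the plain small kernel above
// except for a single extra `fmax` against the zero register before the
// store, which is how ReLU fuses for free into the epilogue. A minimal
// intrinsics sketch, assuming <arm_neon.h>:
//   #include <arm_neon.h>
//   inline void store_relu(float* dst, float32x4_t acc) {
//     acc = vmaxq_f32(acc, vdupq_n_f32(0.f));  // clamp negatives: ReLU
//     vst1q_f32(dst, acc);                     // write the four outputs
//   }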
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "fmax v4.4s, v4.4s, v31.4s \n" - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - +#ifdef __aarch64__ + act_switch_5x5s2(inr0, + inr1, + inr2, + inr3, + inr4, + outc0, + outc1, + outc2, + outc3, + w0, + w1, + w2, + w3, + w4, + vbias, + weight_c, + bias_local, + act_param); #else - -//! 
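// The armv7 path below works with 16 q registers (versus 32 v registers on
// aarch64), so the kernel receives w0..w5 through "w" operand constraints
// and briefly spills one of them (the vmov.32 q12 / vld1.32 juggling) to
// bring in the 25th weight. Each tap is still a vector multiply-accumulate
// against a broadcast scalar weight; a minimal intrinsics sketch, assuming
// <arm_neon.h>:
//   #include <arm_neon.h>
//   inline float32x4_t fma_tap(float32x4_t acc, float32x4_t in, float w) {
//     return vmlaq_f32(acc, in, vdupq_n_f32(w));  // acc += in * w
//   }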
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! \n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! 
\n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.32 %q[w0], q12 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 
{d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - 
[w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.f32 %q[w0], q12 \n" - "vmax.f32 q13, q13, q15 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - 
[s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9\n"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; + act_switch_5x5s2(inr0, + inr1, + inr2, + inr3, + inr4, + outc0, + outc1, + outc2, + outc3, + vzero, + vzero, + vzero, + vzero, + vzero, + vzero, + weight_c, + bias_local, + act_param); +#endif + if (flag_mask) { + for (int i = 0; i < remain; ++i) { + c0[i] = pre_out[i]; + c1[i] = pre_out[i + 4]; + c2[i] = pre_out[i + 8]; + c3[i] = pre_out[i + 12]; + } } + inr0 += 32; + inr1 += 32; + inr2 += 32; + inr3 += 32; + inr4 += 32; + outc0 += 4; + outc1 += 4; + outc2 += 4; + outc3 += 4; } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! 
\n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; } } } } -#endif // __aarch64__ } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..c778896550de73f888979c8337731a0b9967b5dd --- /dev/null +++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc @@ -0,0 +1,795 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_depthwise.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) + +template +void conv_depthwise_5x5s2_int8(Dtype* dout, + const int8_t* din, + const int8_t* weights, + const float* scale, + const float* bias, + bool flag_bias, + bool flag_relu, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int padw, + int padh, + ARMContext* ctx) { + const int threads = ctx->threads(); + int llc_size = ctx->llc_size() / 4; + + const int hout_c_block = 8; + const int hout_r_kernel = 1; + const int wout_block = 4; + const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block; + const int win_round = wout_round * 2 + 3; + + //! get h block + //! 
llc_size = threads * win_round * hout_c_block * hin_r_block * + //! sizeof(int8_t) + //! + wout_round * hout_c_block * hout_r_block * threads * sizeof(int32_t) + //! win_round = wout_round * 2 + 3 + //! hin_r_block = hout_r_block * 2 + 3 + int hout_r_block = (llc_size - 3 * win_round * hout_c_block * threads) / + (2 * win_round * hout_c_block * threads + + hout_c_block * wout_round * threads * 4); + hout_r_block = hout_r_block > hout ? hout : hout_r_block; + hout_r_block = + ((hout_r_block + hout_r_kernel - 1) / hout_r_kernel) * hout_r_kernel; + hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; + + const int hin_r_block = hout_r_block * 2 + 3; + + auto tmp_work_space = ctx->workspace_data(); + int8_t ptr_zero[win_round]; // NOLINT + memset(ptr_zero, 0, sizeof(int8_t) * win_round); + Dtype ptr_write[wout_round]; // NOLINT + + int in_len = win_round * hout_c_block; + int pre_in_size = hin_r_block * in_len; + pre_in_size = ROUNDUP(pre_in_size, 4); + int pre_out_size = hout_c_block * hout_r_block * wout_round; + + int8_t* tmp_din = tmp_work_space; + + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + int w_stride = 25; // kernel_w * kernel_h; + + int ws = -padw; + int we = ws + win_round; + int w_loop = wout_round / 4; + int chout = chin; + + int out_row_stride = hout_c_block * wout_round; + for (int n = 0; n < num; ++n) { + const int8_t* din_batch = din + n * chin * size_in_channel; + int8_t* dout_batch = reinterpret_cast(dout) + + n * chout * size_out_channel * sizeof(Dtype); + for (int h = 0; h < hout; h += hout_r_block) { + int h_kernel = hout_r_block; + if (h + hout_r_block > hout) { + h_kernel = hout - h; + } + int hs = h - padh; + int he = hs + h_kernel * 2 + 3; + +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < chout; c += hout_c_block) { +#ifdef ARM_WITH_OMP + int8_t* pre_din = + tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size * 4); + int32_t* pre_out = reinterpret_cast(pre_din + pre_in_size); +#else + int32_t* pre_out = reinterpret_cast(tmp_din + pre_in_size); + auto pre_din = tmp_din; +#endif + prepack_input_nxwc8_int8_dw( + din_batch, pre_din, c, hs, he, ws, we, chin, win, hin); + + const int8_t* block_inr0 = pre_din; + const int8_t* block_inr1 = block_inr0 + in_len; + const int8_t* block_inr2 = block_inr1 + in_len; + const int8_t* block_inr3 = block_inr2 + in_len; + const int8_t* block_inr4 = block_inr3 + in_len; + + const int8_t* weight_c = weights + c * w_stride; + float bias_local[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + bias_local[4] = bias[c + 4]; + bias_local[5] = bias[c + 5]; + bias_local[6] = bias[c + 6]; + bias_local[7] = bias[c + 7]; + } + for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { + int cnt = w_loop; + const int8_t* inr0 = block_inr0; + const int8_t* inr1 = block_inr1; + const int8_t* inr2 = block_inr2; + const int8_t* inr3 = block_inr3; + const int8_t* inr4 = block_inr4; + + int32_t* ptr_out0 = pre_out + hk * out_row_stride; +// clang-format off +#ifdef __aarch64__ + auto wptr = weight_c; + asm volatile( + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r0]], #32\n" /* load r0 0-3 */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r0]], #32\n" /* load r0 4-7 */ + "ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 0-3 */ + "1:\n" + /* in r0 */ + "smull v20.8h, v0.8b, v12.8b\n" /* w0, int16, out0 */ + "smull v21.8h, v2.8b, v12.8b\n" /* w0, int16, out1 
*/ + "smull v22.8h, v4.8b, v12.8b\n" /* w0, int16, out2 */ + "smull v23.8h, v6.8b, v12.8b\n" /* w0, int16, out3 */ + "ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r0]]\n" /* load r0 8-11 */ + "smlal v20.8h, v1.8b, v13.8b\n" /* w1, int16, out0 */ + "smlal v21.8h, v3.8b, v13.8b\n" /* w1, int16, out1 */ + "smlal v22.8h, v5.8b, v13.8b\n" /* w1, int16, out2 */ + "smlal v23.8h, v7.8b, v13.8b\n" /* w1, int16, out3 */ + "sxtl v24.4s, v20.4h\n" /* mov to out0 low */ + "sxtl2 v25.4s, v20.8h\n" /* mov to out0 hig */ + "sxtl v26.4s, v21.4h\n" /* mov to out1 low */ + "sxtl2 v27.4s, v21.8h\n" /* mov to out1 hig */ + "sxtl v28.4s, v22.4h\n" /* mov to out2 low */ + "sxtl2 v29.4s, v22.8h\n" /* mov to out2 hig */ + "sxtl v30.4s, v23.4h\n" /* mov to out3 low */ + "sxtl2 v31.4s, v23.8h\n" /* mov to out3 hig */ + "ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 4-7 */ + + "smull v20.8h, v2.8b, v14.8b\n" /* w2, int16, out0 */ + "smull v21.8h, v4.8b, v14.8b\n" /* w2, int16, out1 */ + "smull v22.8h, v6.8b, v14.8b\n" /* w2, int16, out2 */ + "smull v23.8h, v8.8b, v14.8b\n" /* w2, int16, out3 */ + "smlal v20.8h, v3.8b, v15.8b\n" /* w3, int16, out0 */ + "smlal v21.8h, v5.8b, v15.8b\n" /* w3, int16, out1 */ + "smlal v22.8h, v7.8b, v15.8b\n" /* w3, int16, out2 */ + "smlal v23.8h, v9.8b, v15.8b\n" /* w3, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r1]], #32\n" /* load r1 0-3 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v4.8b, v16.8b\n" /* w4, int16, out0 */ + "smull v21.8h, v6.8b, v16.8b\n" /* w4, int16, out1 */ + "smull v22.8h, v8.8b, v16.8b\n" /* w4, int16, out2 */ + "smull v23.8h, v10.8b, v16.8b\n" /* w4, int16, out3 */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r1]], #32\n" /* load r1 4-7 */ + /* in r1 */ + "smlal v20.8h, v0.8b, v17.8b\n" /* w5, int16, out0 */ + "smlal v21.8h, v2.8b, v17.8b\n" /* w5, int16, out1 */ + "smlal v22.8h, v4.8b, v17.8b\n" /* w5, int16, out2 */ + "smlal v23.8h, v6.8b, v17.8b\n" /* w5, int16, out3 */ + "ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r1]]\n" /* load r1 8-11 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v1.8b, v18.8b\n" /* w6, int16, out0 */ + "smull v21.8h, v3.8b, v18.8b\n" /* w6, int16, out1 */ + "smull v22.8h, v5.8b, v18.8b\n" /* w6, int16, out2 */ + "smull v23.8h, v7.8b, v18.8b\n" /* w6, int16, out3 */ + "ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 8-11 */ + "smlal v20.8h, v2.8b, v19.8b\n" /* w7, int16, out0 */ + "smlal v21.8h, v4.8b, v19.8b\n" /* w7, int16, out1 */ + "smlal v22.8h, v6.8b, v19.8b\n" /* w7, int16, out2 */ + "smlal v23.8h, v8.8b, v19.8b\n" /* w7, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig 
*/ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 12-15 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v3.8b, v12.8b\n" /* w8, int16, out0 */ + "smull v21.8h, v5.8b, v12.8b\n" /* w8, int16, out1 */ + "smull v22.8h, v7.8b, v12.8b\n" /* w8, int16, out2 */ + "smull v23.8h, v9.8b, v12.8b\n" /* w8, int16, out3 */ + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r2]], #32\n" /* load r2 0-3 */ + "smlal v20.8h, v4.8b, v13.8b\n" /* w9, int16, out0 */ + "smlal v21.8h, v6.8b, v13.8b\n" /* w9, int16, out1 */ + "smlal v22.8h, v8.8b, v13.8b\n" /* w9, int16, out2 */ + "smlal v23.8h, v10.8b, v13.8b\n" /* w9, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r2]], #32\n" /* load r2 4-7 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + /* in r2 */ + "smull v20.8h, v0.8b, v14.8b\n" /* w10, int16, out0 */ + "smull v21.8h, v2.8b, v14.8b\n" /* w10, int16, out1 */ + "smull v22.8h, v4.8b, v14.8b\n" /* w10, int16, out2 */ + "smull v23.8h, v6.8b, v14.8b\n" /* w10, int16, out3 */ + "ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r2]]\n" /* load r2 8-11 */ + "smlal v20.8h, v1.8b, v15.8b\n" /* w11, int16, out0 */ + "smlal v21.8h, v3.8b, v15.8b\n" /* w11, int16, out1 */ + "smlal v22.8h, v5.8b, v15.8b\n" /* w11, int16, out2 */ + "smlal v23.8h, v7.8b, v15.8b\n" /* w11, int16, out3 */ + + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 16-19 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v2.8b, v16.8b\n" /* w12, int16, out0 */ + "smull v21.8h, v4.8b, v16.8b\n" /* w12, int16, out1 */ + "smull v22.8h, v6.8b, v16.8b\n" /* w12, int16, out2 */ + "smull v23.8h, v8.8b, v16.8b\n" /* w12, int16, out3 */ + "smlal v20.8h, v3.8b, v17.8b\n" /* w13, int16, out0 */ + "smlal v21.8h, v5.8b, v17.8b\n" /* w13, int16, out1 */ + "smlal v22.8h, v7.8b, v17.8b\n" /* w13, int16, out2 */ + "smlal v23.8h, v9.8b, v17.8b\n" /* w13, int16, out3 */ + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r3]], #32\n" /* load r3 0-3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + 
"smull v20.8h, v4.8b, v18.8b\n" /* w14, int16, out0 */ + "smull v21.8h, v6.8b, v18.8b\n" /* w14, int16, out1 */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r3]], #32\n" /* load r3 4-7 */ + "smull v22.8h, v8.8b, v18.8b\n" /* w14, int16, out2 */ + "smull v23.8h, v10.8b, v18.8b\n" /* w14, int16, out3 */ + /* in r3 */ + "smlal v20.8h, v0.8b, v19.8b\n" /* w15, int16, out0 */ + "smlal v21.8h, v2.8b, v19.8b\n" /* w15, int16, out1 */ + "smlal v22.8h, v4.8b, v19.8b\n" /* w15, int16, out2 */ + "smlal v23.8h, v6.8b, v19.8b\n" /* w15, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r3]]\n" /* load r3 8-11 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v1.8b, v12.8b\n" /* w16, int16, out0 */ + "smull v21.8h, v3.8b, v12.8b\n" /* w16, int16, out1 */ + "smull v22.8h, v5.8b, v12.8b\n" /* w16, int16, out2 */ + "smull v23.8h, v7.8b, v12.8b\n" /* w16, int16, out3 */ + "ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 20-23 */ + "smlal v20.8h, v2.8b, v13.8b\n" /* w17, int16, out0 */ + "smlal v21.8h, v4.8b, v13.8b\n" /* w17, int16, out1 */ + "smlal v22.8h, v6.8b, v13.8b\n" /* w17, int16, out2 */ + "smlal v23.8h, v8.8b, v13.8b\n" /* w17, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v3.8b, v14.8b\n" /* w18, int16, out0 */ + "smull v21.8h, v5.8b, v14.8b\n" /* w18, int16, out1 */ + "smull v22.8h, v7.8b, v14.8b\n" /* w18, int16, out2 */ + "smull v23.8h, v9.8b, v14.8b\n" /* w18, int16, out3 */ + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r4]], #32\n" /* load r4 0-3 */ + "smlal v20.8h, v4.8b, v15.8b\n" /* w19, int16, out0 */ + "smlal v21.8h, v6.8b, v15.8b\n" /* w19, int16, out1 */ + "smlal v22.8h, v8.8b, v15.8b\n" /* w19, int16, out2 */ + "smlal v23.8h, v10.8b, v15.8b\n" /* w19, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r4]], #32\n" /* load r4 4-7 */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + /* in r4 */ + "smull v20.8h, v0.8b, v16.8b\n" /* w20, int16, out0 */ + "smull v21.8h, v2.8b, v16.8b\n" /* w20, int16, out1 */ + "smull v22.8h, v4.8b, v16.8b\n" /* w20, int16, out2 */ + "smull v23.8h, v6.8b, v16.8b\n" /* w20, int16, out3 */ + "ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r4]]\n" /* load r4 8-11 */ + "smlal v20.8h, v1.8b, v17.8b\n" /* w21, int16, out0 */ + "smlal v21.8h, v3.8b, v17.8b\n" /* w21, int16, out1 */ + 
"smlal v22.8h, v5.8b, v17.8b\n" /* w21, int16, out2 */ + "smlal v23.8h, v7.8b, v17.8b\n" /* w21, int16, out3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + "ld1 {v16.8b}, [%[wc]], #8\n" /* load wc 24 */ + "smull v20.8h, v2.8b, v18.8b\n" /* w22, int16, out0 */ + "smull v21.8h, v4.8b, v18.8b\n" /* w22, int16, out1 */ + "smull v22.8h, v6.8b, v18.8b\n" /* w22, int16, out2 */ + "smull v23.8h, v8.8b, v18.8b\n" /* w22, int16, out3 */ + "sub %[wc], %[wc], #200 \n" + "smlal v20.8h, v3.8b, v19.8b\n" /* w23, int16, out0 */ + "smlal v21.8h, v5.8b, v19.8b\n" /* w23, int16, out1 */ + "smlal v22.8h, v7.8b, v19.8b\n" /* w23, int16, out2 */ + "smlal v23.8h, v9.8b, v19.8b\n" /* w23, int16, out3 */ + "ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r0]], #32\n" /* load r0 0-3 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 0-3 */ + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + + "smull v20.8h, v4.8b, v16.8b\n" /* w24, int16, out0 */ + "smull v21.8h, v6.8b, v16.8b\n" /* w24, int16, out1 */ + "smull v22.8h, v8.8b, v16.8b\n" /* w24, int16, out2 */ + "smull v23.8h, v10.8b, v16.8b\n" /* w24, int16, out3 */ + "ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r0]], #32\n" /* load r0 4-7 */ + "saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */ + "saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */ + "saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */ + "saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */ + "stp q24, q25, [%[ptr_out0]], #32\n" + "saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */ + "saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */ + "stp q26, q27, [%[ptr_out0]], #32\n" + "saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */ + "saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */ + "subs %w[cnt], %w[cnt], #1\n" + "stp q28, q29, [%[ptr_out0]], #32\n" + "stp q30, q31, [%[ptr_out0]], #32\n" + "bne 1b\n" + : [cnt] "+r"(cnt), + [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [r4] "+r"(inr4), + [wc] "+r"(wptr), + [ptr_out0] "+r"(ptr_out0) + : + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13", + "v14","v15","v16","v17","v18","v19", + "v20","v21","v22","v23","v24","v25", + "v26","v27","v28","v29","v30","v31" + ); +#else + auto wptr = weight_c; + asm volatile( + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load r0, 0-3 */ + "vld1.32 {d4-d5}, [%[r0]]!\n" /* load r0, 4-5 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w0-w1 */ + "1:\n" + /* inr0 */ + "vmull.s8 q4, d0, d6\n" /* int16, out0 */ + "vmull.s8 q5, d2, d6\n" /* int16, out1 */ + "vmull.s8 q6, d4, d6\n" /* int16, out2 */ + "vmlal.s8 q4, d1, d7\n" /* int16, out0 */ + "vld1.32 {d0-d1}, [%[r0]]!\n" /* load r0, 6-7 */ + "vmlal.s8 q5, d3, d7\n" /* int16, out1 
*/ + "vmlal.s8 q6, d5, d7\n" /* int16, out2 */ + "vmovl.s16 q8, d8\n" /* mov to out0 low */ + "vmull.s8 q7, d0, d6\n" /* int16, out3 */ + "vmovl.s16 q9, d9\n" /* mov to out0 hig */ + "vmovl.s16 q10, d10\n" /* mov to out1 low */ + "vmovl.s16 q11, d11\n" /* mov to out1 hig */ + "vmlal.s8 q7, d1, d7\n" /* int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w2-w3 */ + "vmovl.s16 q12, d12\n" /* mov to out2 low */ + "vmovl.s16 q13, d13\n" /* mov to out2 hig */ + "vmovl.s16 q14, d14\n" /* mov to out3 low */ + "vmovl.s16 q15, d15\n" /* mov to out3 hig */ + + "vmull.s8 q4, d2, d6\n" /* w2, int16, out0 */ + "vmull.s8 q5, d4, d6\n" /* w2, int16, out1 */ + "vmull.s8 q6, d0, d6\n" /* w2, int16, out2 */ + "vmlal.s8 q4, d3, d7\n" /* w3, int16, out0 */ + "vld1.32 {d2-d3}, [%[r0]]!\n" /* load r0, 8-9 */ + "vmlal.s8 q5, d5, d7\n" /* w3, int16, out1 */ + "vmlal.s8 q6, d1, d7\n" /* w3, int16, out2 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vmull.s8 q7, d2, d6\n" /* w2, int16, out3 */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d3, d7\n" /* w3, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w4-w5 */ + "vld1.32 {d5}, [%[r0]]\n" /* load r0, 10 */ + "sub %[r0], %[r0], #16\n" /* r0 = r0 - 16 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d4, d6\n" /* w4, int16, out0 */ + "vmull.s8 q5, d0, d6\n" /* w4, int16, out1 */ + "vmull.s8 q6, d2, d6\n" /* w4, int16, out2 */ + "vmull.s8 q7, d5, d6\n" /* w4, int16, out3 */ + "vld1.32 {d0-d3}, [%[r1]]!\n" /* load r1, 0-3 */ + "vld1.32 {d4-d5}, [%[r1]]!\n" /* load r1, 4-5 */ + /* inr1 */ + "vmlal.s8 q4, d0, d7\n" /* w5, int16, out0 */ + "vmlal.s8 q5, d2, d7\n" /* w5, int16, out1 */ + "vmlal.s8 q6, d4, d7\n" /* w5, int16, out2 */ + "vld1.32 {d0}, [%[r1]]!\n" /* load r1, 6 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d0, d7\n" /* w5, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w6-w7 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d1, d6\n" /* w6, int16, out0 */ + "vld1.32 {d1}, [%[r1]]!\n" /* load r1, 7 */ + "vmull.s8 q5, d3, d6\n" /* w6, int16, out1 */ + "vmull.s8 q6, d5, d6\n" /* w6, int16, out2 */ + "vmlal.s8 q4, d2, d7\n" /* w7, int16, out0 */ + "vmlal.s8 q5, d4, d7\n" /* w7, int16, out1 */ + "vmlal.s8 q6, d0, d7\n" /* w7, int16, out2 */ + "vmull.s8 q7, d1, d6\n" /* w6, int16, out3 */ + "vld1.32 {d2}, [%[r1]]!\n" /* load r1, 8 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d2, d7\n" /* w7, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w8-w9 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d3, d6\n" /* w8, int16, out0 */ + 
"vld1.32 {d3}, [%[r1]]!\n" /* load r1, 9 */ + "vmull.s8 q5, d5, d6\n" /* w8, int16, out1 */ + "vmull.s8 q6, d1, d6\n" /* w8, int16, out2 */ + "vld1.32 {d5}, [%[r1]]\n" /* load r1, 10 */ + "vmlal.s8 q4, d4, d7\n" /* w9, int16, out0 */ + "vmlal.s8 q5, d0, d7\n" /* w9, int16, out1 */ + "vmlal.s8 q6, d2, d7\n" /* w9, int16, out2 */ + "vmull.s8 q7, d3, d6\n" /* w8, int16, out3 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d5, d7\n" /* w9, int16, out3 */ + "sub %[r1], %[r1], #16\n" /* r1 = r1 - 16 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w10-w11 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + "vld1.32 {d0-d3}, [%[r2]]!\n" /* load r2, 0-3 */ + "vld1.32 {d4-d5}, [%[r2]]!\n" /* load r2, 4-5 */ + + /* inr2 */ + "vmull.s8 q4, d0, d6\n" /* w10, int16, out0 */ + "vmull.s8 q5, d2, d6\n" /* w10, int16, out1 */ + "vmull.s8 q6, d4, d6\n" /* w10, int16, out2 */ + "vmlal.s8 q4, d1, d7\n" /* w11, int16, out0 */ + "vld1.32 {d0-d1}, [%[r2]]!\n" /* load r2, 6-7 */ + "vmlal.s8 q5, d3, d7\n" /* w11, int16, out1 */ + "vmlal.s8 q6, d5, d7\n" /* w11, int16, out2 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vmull.s8 q7, d0, d6\n" /* w10, int16, out3 */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d1, d7\n" /* w11, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w12-w13 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d2, d6\n" /* w12, int16, out0 */ + "vmull.s8 q5, d4, d6\n" /* w12, int16, out1 */ + "vmull.s8 q6, d0, d6\n" /* w12, int16, out2 */ + "vmlal.s8 q4, d3, d7\n" /* w13, int16, out0 */ + "vld1.32 {d2-d3}, [%[r2]]!\n" /* load r2, 8-9 */ + "vmlal.s8 q5, d5, d7\n" /* w13, int16, out1 */ + "vmlal.s8 q6, d1, d7\n" /* w13, int16, out2 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vmull.s8 q7, d2, d6\n" /* w12, int16, out3 */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d3, d7\n" /* w13, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w14-w15 */ + "vld1.32 {d5}, [%[r2]]\n" /* load r2, 10 */ + "sub %[r2], %[r2], #16\n" /* r2 = r2 - 16 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d4, d6\n" /* w14, int16, out0 */ + "vmull.s8 q5, d0, d6\n" /* w14, int16, out1 */ + "vmull.s8 q6, d2, d6\n" /* w14, int16, out2 */ + "vmull.s8 q7, d5, d6\n" /* w14, int16, out3 */ + "vld1.32 {d0-d3}, [%[r3]]!\n" /* load r3, 0-3 */ + "vld1.32 {d4-d5}, [%[r3]]!\n" /* load r3, 4-5 */ + /* inr3 */ + "vmlal.s8 q4, d0, d7\n" /* w15, int16, out0 */ + "vmlal.s8 q5, d2, d7\n" /* w15, int16, out1 */ + "vmlal.s8 q6, d4, d7\n" /* w15, int16, out2 */ + "vld1.32 {d0}, [%[r3]]!\n" /* load r3, 6 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to 
out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d0, d7\n" /* w15, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w16-w17 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d1, d6\n" /* w16, int16, out0 */ + "vld1.32 {d1}, [%[r3]]!\n" /* load r3, 7 */ + "vmull.s8 q5, d3, d6\n" /* w16, int16, out1 */ + "vmull.s8 q6, d5, d6\n" /* w16, int16, out2 */ + "vmlal.s8 q4, d2, d7\n" /* w17, int16, out0 */ + "vmlal.s8 q5, d4, d7\n" /* w17, int16, out1 */ + "vmlal.s8 q6, d0, d7\n" /* w17, int16, out2 */ + "vmull.s8 q7, d1, d6\n" /* w16, int16, out3 */ + "vld1.32 {d2}, [%[r3]]!\n" /* load r3, 8 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d2, d7\n" /* w17, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w18-w19 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d3, d6\n" /* w18, int16, out0 */ + "vld1.32 {d3}, [%[r3]]!\n" /* load r3, 9 */ + "vmull.s8 q5, d5, d6\n" /* w18, int16, out1 */ + "vmull.s8 q6, d1, d6\n" /* w18, int16, out2 */ + "vld1.32 {d5}, [%[r3]]\n" /* load r3, 10 */ + "vmlal.s8 q4, d4, d7\n" /* w19, int16, out0 */ + "vmlal.s8 q5, d0, d7\n" /* w19, int16, out1 */ + "vmlal.s8 q6, d2, d7\n" /* w19, int16, out2 */ + "vmull.s8 q7, d3, d6\n" /* w18, int16, out3 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d5, d7\n" /* w19, int16, out3 */ + "sub %[r3], %[r3], #16\n" /* r3 = r3 - 16 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w20-w21 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + "vld1.32 {d0-d3}, [%[r4]]!\n" /* load r4, 0-3 */ + "vld1.32 {d4-d5}, [%[r4]]!\n" /* load r4, 4-5 */ + + /* inr4 */ + "vmull.s8 q4, d0, d6\n" /* w20, int16, out0 */ + "vmull.s8 q5, d2, d6\n" /* w20, int16, out1 */ + "vmull.s8 q6, d4, d6\n" /* w20, int16, out2 */ + "vmlal.s8 q4, d1, d7\n" /* w21, int16, out0 */ + "vld1.32 {d0-d1}, [%[r4]]!\n" /* load r4, 6-7 */ + "vmlal.s8 q5, d3, d7\n" /* w21, int16, out1 */ + "vmlal.s8 q6, d5, d7\n" /* w21, int16, out2 */ + "vaddw.s16 q8, q8, d8\n" /* add to out0 low */ + "vmull.s8 q7, d0, d6\n" /* w20, int16, out3 */ + "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */ + "vaddw.s16 q10, q10, d10\n" /* add to out1 low */ + "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */ + "vmlal.s8 q7, d1, d7\n" /* w21, int16, out3 */ + "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w22-w23 */ + "vaddw.s16 q12, q12, d12\n" /* add to out2 low */ + "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */ + "vaddw.s16 q14, q14, d14\n" /* add to out3 low */ + "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */ + + "vmull.s8 q4, d2, d6\n" /* w22, int16, out0 */ + "vmull.s8 q5, d4, d6\n" /* w22, int16, out1 */ + "vmull.s8 q6, d0, d6\n" /* w22, int16, out2 */ + "vmlal.s8 
q4, d3, d7\n" /* w23, int16, out0 */
+          "vld1.32 {d2-d3}, [%[r4]]!\n" /* load r4, 8-9 */
+          "vmlal.s8 q5, d5, d7\n" /* w23, int16, out1 */
+          "vmlal.s8 q6, d1, d7\n" /* w23, int16, out2 */
+          "vaddw.s16 q8, q8, d8\n" /* add to out0 low */
+          "vmull.s8 q7, d2, d6\n" /* w22, int16, out3 */
+          "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
+          "vaddw.s16 q10, q10, d10\n" /* add to out1 low */
+          "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
+          "vmlal.s8 q7, d3, d7\n" /* w23, int16, out3 */
+          "vld1.32 {d6}, [%[wptr]]!\n" /* load w24 */
+          "vld1.32 {d5}, [%[r4]]\n" /* load r4, 10 */
+          "sub %[r4], %[r4], #16\n" /* r4 = r4 - 16 */
+          "vaddw.s16 q12, q12, d12\n" /* add to out2 low */
+          "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
+          "vaddw.s16 q14, q14, d14\n" /* add to out3 low */
+          "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
+          "sub %[wptr], %[wptr], #200 \n" /* wptr = wptr - 200 */
+
+          "vmull.s8 q4, d4, d6\n" /* w24, int16, out0 */
+          "vmull.s8 q5, d0, d6\n" /* w24, int16, out1 */
+          "vmull.s8 q6, d2, d6\n" /* w24, int16, out2 */
+          "vmull.s8 q7, d5, d6\n" /* w24, int16, out3 */
+          "vld1.32 {d0-d3}, [%[r0]]!\n" /* load r0, 0-3 */
+          "vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w0-w1 */
+          "vaddw.s16 q8, q8, d8\n" /* add to out0 low */
+          "vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
+          "vld1.32 {d4-d5}, [%[r0]]!\n" /* load r0, 4-5 */
+          "vaddw.s16 q10, q10, d10\n" /* add to out1 low */
+          "vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
+          "vst1.32 {d16-d19}, [%[ptr_out0]]!\n" /* store out0 */
+          "vaddw.s16 q12, q12, d12\n" /* add to out2 low */
+          "vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
+          "vst1.32 {d20-d23}, [%[ptr_out0]]!\n" /* store out1 */
+          "vaddw.s16 q14, q14, d14\n" /* add to out3 low */
+          "vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
+          "subs %[cnt], #1\n" /* cnt = cnt - 1 */
+          "vst1.32 {d24-d27}, [%[ptr_out0]]!\n" /* store out2 */
+          "vst1.32 {d28-d31}, [%[ptr_out0]]!\n" /* store out3 */
+          "bne 1b\n" /* branch main loop */
+          : [cnt] "+r"(cnt), [r0] "+r"(inr0), [r1] "+r"(inr1),
+            [r2] "+r"(inr2), [r3] "+r"(inr3), [r4] "+r"(inr4),
+            [ptr_out0] "+r"(ptr_out0), [wptr] "+r"(wptr)
+          :
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+            "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+#endif
+      // clang-format on
+      block_inr0 = block_inr2;
+      block_inr1 = block_inr3;
+      block_inr2 = block_inr4;
+      block_inr3 = block_inr2 + in_len;
+      block_inr4 = block_inr3 + in_len;
+    }
+    write_int32_nchwc8_to_nchw(pre_out,
+                               reinterpret_cast<Dtype*>(dout_batch),
+                               c, c + hout_c_block, h, h + h_kernel,
+                               0, wout_round, chout, hout, wout,
+                               flag_relu, bias_local, flag_bias,
+                               ptr_write, scale + c);
+      }
+    }
+  }
+}
+
+template void conv_depthwise_5x5s2_int8<int8_t>(int8_t* dout,
+    const int8_t* din, const int8_t* weights, const float* scale,
+    const float* bias, bool flag_bias, bool flag_relu, int num, int chin,
+    int hin, int win, int hout, int wout, int padw, int padh, ARMContext* ctx);
+
+template void conv_depthwise_5x5s2_int8<float>(float* dout,
+    const int8_t* din, const int8_t* weights, const float* scale,
+    const float* bias, bool flag_bias, bool flag_relu, int num, int chin,
+    int hin, int win, int hout, int wout, int padw, int padh, ARMContext* ctx);
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
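[Editor's note] As a reading aid only -- not part of the patch -- the NEON
kernels above compute, per channel and output point, the plain 5x5 stride-2
int8 dot product sketched below. Names are illustrative; the asm accumulates
the same products into the int32 accumulators (v24-v31 / q8-q15) for four
output columns at a time, assuming padding was already applied by the
caller's row pre-packing:

    #include <cstdint>
    // One output point of a 5x5, stride-2, int8 depthwise convolution.
    inline int32_t dw5x5s2_ref(const int8_t* din, const int8_t* wei,
                               int win, int oh, int ow) {
      int32_t acc = 0;
      for (int kh = 0; kh < 5; ++kh) {
        for (int kw = 0; kw < 5; ++kw) {
          acc += static_cast<int32_t>(din[(2 * oh + kh) * win + 2 * ow + kw]) *
                 static_cast<int32_t>(wei[5 * kh + kw]);
        }
      }
      return acc;  // scaled/biased to int8 or float by the write-out stage
    }

diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h
index 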
b2d16d18d2300ea51de8c8e9f25664ffdf4aebc7..85404d6a6e2e6246677857be8231e15afa86210d 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -20,6 +20,7 @@ #include "lite/backends/arm/math/sgemm.h" #include "lite/backends/arm/math/type_trans.h" #include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -28,6 +29,7 @@ namespace arm { namespace math { #define LITEMAX(a, b) ((a) > (b) ? (a) : (b)) +#define LITEMIN(a, b) ((a) < (b) ? (a) : (b)) #define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) template @@ -254,6 +256,7 @@ inline void prepack_input_nxwc4_dw(const float* din, LOG(FATAL) << "prepack_dw_input, valid height must > zero"; } float32x4_t vzero = vdupq_n_f32(0.f); + auto out_data = dout; int size_w = we - ws; int w0 = ws < 0 ? 0 : ws; @@ -269,6 +272,7 @@ inline void prepack_input_nxwc4_dw(const float* din, bool flag_ext_l = left_remain > 0; int left_sl = 4 - left_remain; + int left_valid_sl = left_sl > width ? width : left_sl; uint32x4_t vmask_padl; bool flag_mask_l = false; if (flag_ext_l) { @@ -290,6 +294,7 @@ inline void prepack_input_nxwc4_dw(const float* din, } int size_c = width * height; for (int h = hs; h < he; ++h) { + dout = out_data + (h - hs) * 4 * size_w; auto ptr_c0 = din + cs * size_c + h * width; auto ptr_c1 = ptr_c0 + size_c; auto ptr_c2 = ptr_c1 + size_c; @@ -351,10 +356,10 @@ inline void prepack_input_nxwc4_dw(const float* din, } transpose_4x4(vc0, vc1, vc2, vc3, dout); dout += 16; - ptr_c0 += left_sl; - ptr_c1 += left_sl; - ptr_c2 += left_sl; - ptr_c3 += left_sl; + ptr_c0 += left_valid_sl; + ptr_c1 += left_valid_sl; + ptr_c2 += left_valid_sl; + ptr_c3 += left_valid_sl; } /// valid for (int i = 0; i < cnt_valid; ++i) { @@ -586,7 +591,238 @@ inline void prepack_input_nxwc8_int8_dw(const int8_t* din, } } } - +// clang-format off +#ifdef __aarch64__ +#define NCHWC1_TRANS_FP32_COMPUTE \ + "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q1, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q2, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q3, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ + +#define NCHWC1_TRANS_FP32_RELU \ + "fmax v0.4s, v0.4s, v20.4s \n" /*relu*/ \ + "fmax v1.4s, v1.4s, v20.4s \n" /*relu*/ \ + "fmax v2.4s, v2.4s, v20.4s \n" /*relu*/ \ + "fmax v3.4s, v3.4s, v20.4s \n" /*relu*/ + +#define NCHWC1_TRANS_FP32_RELU6 \ + "fmin v0.4s, v0.4s, %[six].4s \n" /* relu6 */ \ + "fmin v1.4s, v1.4s, %[six].4s \n" /* relu6 */ \ + "fmin v2.4s, v2.4s, %[six].4s \n" /* relu6 */ \ + "fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */ + +#define NCHWC1_TRANS_FP32_LEAKY_RELU \ + "fcmge v4.4s, v0.4s, v20.4s \n" /* vcgeq_f32 */ \ + "fcmge v5.4s, v1.4s, v20.4s \n" /* vcgeq_f32 */ \ + "fcmge v6.4s, v2.4s, v20.4s \n" /* vcgeq_f32 */ \ + "fcmge v7.4s, v3.4s, v20.4s \n" /* vcgeq_f32 */ \ + "fmul v8.4s, v0.4s, %[scale].4s \n" /* mul */ \ + "fmul v9.4s, v1.4s, %[scale].4s \n" /* mul */ \ + "fmul v10.4s, v2.4s, %[scale].4s \n" /* mul */ \ + "fmul v11.4s, v3.4s, %[scale].4s \n" /* mul */ \ + "bif v0.16b, v8.16b, v4.16b \n" /* choose*/ \ + "bif v1.16b, v9.16b, v5.16b \n" /* choose*/ \ + "bif v2.16b, v10.16b, v6.16b \n" /* choose*/ \ + "bif v3.16b, v11.16b, v7.16b \n" /* choose*/ + +#define NCHWC1_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q0, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q1, 
[%[doutc0r0]], #16 \n" /* store c0r0*/ \
+  "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1*/ \
+  "ldr q1, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1*/ \
+  "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
+  "str q3, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
+  "ldr q2, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1*/ \
+  "ldr q3, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1*/ \
+  \
+  "bne 1b \n" /* jump to main loop*/
+#else
+#define NCHWC1_TRANS_FP32_COMPUTE \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0 \n" \
+  "vld1.32 {d4-d7}, [%[ptr_din]]! @ load data, c0r0 \n" \
+  "vmov.u32 q15, #0 @ dump zero\n" \
+  "1: @ main loop\n"
+
+#define NCHWC1_TRANS_FP32_RELU \
+  "vmax.f32 q0, q0, q15 @ relu\n" \
+  "vmax.f32 q1, q1, q15 @ relu\n" \
+  "vmax.f32 q2, q2, q15 @ relu\n" \
+  "vmax.f32 q3, q3, q15 @ relu\n"
+
+#define NCHWC1_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6 \n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6 \n" \
+  "vmin.f32 q2, q2, %q[six] @ relu6 \n" \
+  "vmin.f32 q3, q3, %q[six] @ relu6 \n"
+
+#define NCHWC1_TRANS_FP32_LEAKY_RELU \
+  "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q6, q1, q15 @ q1 > 0 \n" \
+  "vcge.f32 q7, q2, q15 @ q2 > 0 \n" \
+  "vcge.f32 q8, q3, q15 @ q3 > 0 \n" \
+  "vmul.f32 q9, q0, %q[scale] \n" \
+  "vmul.f32 q10, q1, %q[scale] \n" \
+  "vmul.f32 q11, q2, %q[scale] \n" \
+  "vmul.f32 q12, q3, %q[scale] \n" \
+  "vbif q0, q9, q5 @ choose \n" \
+  "vbif q1, q10, q6 @ choose \n" \
+  "vbif q2, q11, q7 @ choose \n" \
+  "vbif q3, q12, q8 @ choose \n"
+
+#define NCHWC1_TRANS_FP32_STORE \
+  "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result \n" \
+  "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result \n" \
+  "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \
+  \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" \
+  "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result \n" \
+  "vst1.32 {d6-d7}, [%[doutc0r0]]! @ store result \n" \
+  \
+  "vld1.32 {d4-d7}, [%[ptr_din]]! 
@ load data \n" \
+  \
+  "bne 1b @ jump to main loop\n"
+#endif
+// clang-format on
+inline void act_switch_c1_fp32(const float* din_ptr,
+                               float* doutc0_ptr,
+                               int cnt_loop,
+                               const operators::ActivationParam* act_param) {
+  if (act_param != nullptr && act_param->has_active) {
+    float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef);
+    float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha);
+    switch (act_param->active_type) {
+      case lite_api::ActivationType::kRelu:
+#ifdef __aarch64__
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU
+                         NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+                       [ptr_din] "+r"(din_ptr)
+                     :
+                     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                       "v8", "v9", "v10", "v20");
+#else
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU
+                         NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr),
+                       [cnt] "+r"(cnt_loop)
+                     :
+                     : "q0", "q1", "q2", "q3", "q15");
+#endif
+        break;
+      case lite_api::ActivationType::kRelu6:
+/* 0 <= din <= 6 */
+#ifdef __aarch64__
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU
+                         NCHWC1_TRANS_FP32_RELU6 NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+                       [ptr_din] "+r"(din_ptr)
+                     : [six] "w"(six)
+                     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                       "v8", "v9", "v10", "v20");
+#else
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU
+                         NCHWC1_TRANS_FP32_RELU6 NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr),
+                       [cnt] "+r"(cnt_loop)
+                     : [six] "w"(six)
+                     : "q0", "q1", "q2", "q3", "q15");
+#endif
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+/*din = din >= 0 ? din : din * scale*/
+#ifdef __aarch64__
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_LEAKY_RELU
+                         NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+                       [ptr_din] "+r"(din_ptr)
+                     : [scale] "w"(scale)
+                     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                       "v8", "v9", "v10", "v11", "v20");
+#else
+        asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_LEAKY_RELU
+                         NCHWC1_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr),
+                       [cnt] "+r"(cnt_loop)
+                     : [scale] "w"(scale)
+                     : "q0", "q1", "q2", "q3", "q5", "q6", "q7", "q8",
+                       "q9", "q10", "q11", "q12", "q15");
+#endif
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param->active_type)
+                   << " fuse not support";
+    }
+  } else {
+#ifdef __aarch64__
+    asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+                   [ptr_din] "+r"(din_ptr)
+                 :
+                 : "v0", "v1", "v2", "v3", "v20");
+#else
+    asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr),
+                   [cnt] "+r"(cnt_loop)
+                 :
+                 : "q0", "q1", "q2", "q3", "q15");
+#endif
+  }
+}
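+// Editorial sketch, not part of the original patch: per 4-float vector the
+// three fused activations above reduce to the intrinsics below (vzero, vsix
+// and vscale mirror the v20/q15, %[six] and %[scale] asm operands).
+static inline float32x4_t act_c1_fp32_ref(float32x4_t v,
+                                          lite_api::ActivationType type,
+                                          float32x4_t vsix,
+                                          float32x4_t vscale) {
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  switch (type) {
+    case lite_api::ActivationType::kRelu:
+      return vmaxq_f32(v, vzero);
+    case lite_api::ActivationType::kRelu6:
+      return vminq_f32(vmaxq_f32(v, vzero), vsix);
+    case lite_api::ActivationType::kLeakyRelu: {
+      uint32x4_t ge = vcgeq_f32(v, vzero);            // fcmge / vcge.f32
+      return vbslq_f32(ge, v, vmulq_f32(v, vscale));  // bif / vbif
+    }
+    default:
+      return v;  // no fused activation
+  }
+}
 /*write result in outputs
 * input din: [n, c, h, w], output dout: [n, c, h, w]
 */
@@ -602,13 +838,14 @@ inline bool write_to_output_c1_fp32(const float* din,
                                     int height,
                                     int width,
                                     bool flag_relu,
-                                    float* trash_ptr) {
+                                    float* trash_ptr,
+                                    operators::ActivationParam* act_param) {
   if (cs > channel) {
     return true;
   }
   const int c1 = 1;
-  const int w4 = 4;
+  const int w4 = 16;
 
   int size_c_out = width * height;
@@ -620,98 +857,53 @@ inline bool write_to_output_c1_fp32(const float* din,
   int w_round = we - ws;
 
   int cnt = (width - ws) / w4;
-
+  int remain = (width - 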
ws) % w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; const float* din_hei_ptr = ptr_din + i * w_round * c1; if (cnt > 0) { int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "fmax v1.4s, v0.4s, v20.4s \n" /*relu*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q1, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v20"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - - "vmax.f32 q1, q0, q15 @ relu\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - - "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "1: \n" /* main loop*/ - "str q0, [%[doutc0r0]], #16 \n" /* store c2r0*/ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c0r1, c0r2, c0r3\n" - "1: @ main loop\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0"); -#endif - } + act_switch_c1_fp32(din_hei_ptr, doutc0_ptr, cnt_loop, act_param); } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c1 + c1 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - din_hei_ptr++; + doutc0_ptr += w4 * cnt; + int j = w4 * cnt; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; j < width; ++j) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + din_hei_ptr++; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; j < width; ++j) { + float tmp = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp, six); + din_hei_ptr++; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+          for (; j < width; ++j) {
+            if (din_hei_ptr[0] >= 0) {
+              *(doutc0_ptr++) = din_hei_ptr[0];
+            } else {
+              *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+            }
+            din_hei_ptr++;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
+      }
     } else {
       for (; j < width; ++j) {
@@ -722,7 +914,224 @@ inline bool write_to_output_c1_fp32(const float* din,
   }
   return true;
 }
-
+// clang-format off
+#ifdef __aarch64__
+#define NCHWC2_TRANS_FP32_COMPUTE \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \
+  "movi v20.4s, #0 \n" /* for relu */ \
+  "1: \n" /* main loop*/ \
+  "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \
+  "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \
+  "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ \
+  "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/
+
+#define NCHWC2_TRANS_FP32_RELU \
+  "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ \
+  "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/
+
+#define NCHWC2_TRANS_FP32_RELU6 \
+  "fmin v2.4s, v2.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */
+
+#define NCHWC2_TRANS_FP32_LEAKY_RELU \
+  "fcmge v6.4s, v2.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v7.4s, v3.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fmul v4.4s, v2.4s, %[scale].4s \n" /* mul */ \
+  "fmul v5.4s, v3.4s, %[scale].4s \n" /* mul */ \
+  "bif v2.16b, v4.16b, v6.16b \n" /* choose*/ \
+  "bif v3.16b, v5.16b, v7.16b \n" /* choose*/
+
+#define NCHWC2_TRANS_FP32_STORE \
+  "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \
+  \
+  "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
+  "str q3, [%[doutc1r0]], #16 \n" /* store c1r0*/ \
+  \
+  "bne 1b \n" /* jump to main loop*/
+#else
+#define NCHWC2_TRANS_FP32_COMPUTE \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, c1r0 \n" \
+  "vmov.u32 q15, #0 @ dump zero\n" \
+  "1: @ main loop\n" \
+  "vtrn.32 d0, d1 @ trans data: c0r0, c0r1, c1r0, c1r1 \n" \
+  "vtrn.32 d2, d3 @ trans data: c0r2, c0r3, c1r2, c1r3 \n" \
+  \
+  "vswp d1, d2 @ swap data\n"
+
+#define NCHWC2_TRANS_FP32_RELU \
+  "vmax.f32 q0, q0, q15 @ relu\n" \
+  "vmax.f32 q1, q1, q15 @ relu\n"
+
+#define NCHWC2_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6 \n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6 \n"
+
+#define NCHWC2_TRANS_FP32_LEAKY_RELU \
+  "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q6, q1, q15 @ q1 > 0 \n" \
+  "vmul.f32 q9, q0, %q[scale] \n" \
+  "vmul.f32 q10, q1, %q[scale] \n" \
+  "vbif q0, q9, q5 @ choose \n" \
+  "vbif q1, q10, q6 @ choose \n"
+
+#define NCHWC2_TRANS_FP32_STORE \
+  "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \
+  \
+  "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \
+  \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif +// clang-format on +inline void act_switch_c2_fp32(const float* din_ptr, + float* doutc0_ptr, + float* doutc1_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_RELU6 NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [six] "w"(six) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_RELU6 NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [six] "w"(six) + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? 
din : din * scale*/
+#ifdef __aarch64__
+        asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_LEAKY_RELU
+                         NCHWC2_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr),
+                       [doutc1r0] "+r"(doutc1_ptr),
+                       [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr)
+                     : [scale] "w"(scale)
+                     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                       "v8", "v9", "v10", "v20");
+#else
+        asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_LEAKY_RELU
+                         NCHWC2_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr),
+                       [doutc1r0] "+r"(doutc1_ptr),
+                       [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+                     : [scale] "w"(scale)
+                     : "q0", "q1", "q2", "q3", "q5", "q6", "q7", "q8",
+                       "q9", "q10", "q11", "q12", "q15");
+#endif
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param->active_type)
+                   << " fuse not support";
+    }
+  } else {
+#ifdef __aarch64__
+    asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+                   [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr)
+                 :
+                 : "v0", "v1", "v2", "v3", "v4", "v5", "v20");
+#else
+    asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+                   [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+                 :
+                 : "q0", "q1", "q2", "q3", "q15");
+#endif
+  }
+}
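+// Editorial sketch, not part of the original patch: the vtrn/vswp (or
+// trn1/trn2) pair in NCHWC2_TRANS_FP32_COMPUTE de-interleaves four (c0, c1)
+// pairs; vld2q_f32 expresses the same shuffle directly.
+static inline void c2_deinterleave_ref(const float* ptr_din,
+                                       float* doutc0,
+                                       float* doutc1) {
+  float32x4x2_t v = vld2q_f32(ptr_din);  // val[0]=c0r0..c0r3, val[1]=c1r0..c1r3
+  vst1q_f32(doutc0, v.val[0]);
+  vst1q_f32(doutc1, v.val[1]);
+}
 /*write result in outputs
 * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w]
 */
@@ -738,11 +1147,11 @@ inline bool write_to_output_c2_fp32(const float* din,
                                     int height,
                                     int width,
                                     bool flag_relu,
-                                    float* trash_ptr) {
+                                    float* trash_ptr,
+                                    operators::ActivationParam* act_param) {
   if (cs > channel) {
     return true;
   }
-
   const int c2 = 2;
   const int w4 = 4;
@@ -775,141 +1184,56 @@ inline bool write_to_output_c2_fp32(const float* din,
     const float* din_hei_ptr = ptr_din + i * w_round * c2;
     if (cnt > 0) {
       int cnt_loop = cnt;
-      if (flag_relu) {
-#ifdef __aarch64__
-        asm volatile(
-            "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, c1r1, c0r2, c1r2, c0r3, c1r3 */
-            "movi v20.4s, #0 \n" /* for relu */
-            "1: \n" /* main loop*/
-            "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/
-            "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/
-            "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data */
-            "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/
-            "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/
-
-            "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/
-            "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/
-
-            "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/
-
-            "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/
-            "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/
-
-            "bne 1b \n" /* jump to main loop*/
-
-            : [doutc0r0] "+r"(doutc0_ptr),
-              [doutc1r0] "+r"(doutc1_ptr),
-              [cnt] "+r"(cnt_loop),
-              [ptr_din] "+r"(din_hei_ptr)
-            :
-            : "v0", "v1", "v2", "v3", "v4", "v5", "v20");
-#else
-        asm volatile(
-            "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, c1r0, c0r1, c1r1, c0r2, c1r2, c0r3, c1r3\n"
-            "vmov.u32 q15, #0 @ dump zero\n"
-            "1: @ main loop\n"
-            "vtrn.32 d0, d1 @ trans data: c0r0, c0r1, c1r0, c1r1 \n"
-            "vtrn.32 d2, d3 @ trans data: c0r2, c0r3, c1r2, c1r3 \n"
-
-            "vswp d1, d2 @ swap data\n"
-
-            "vmax.f32 q0, q0, q15 @ relu\n"
-            "vmax.f32 q1, q1, q15 @ relu\n"
-
-            "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n"
-            "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n"
-
-            "subs %[cnt], %[cnt], #1 @ loop count - 1\n"
-
-            "vld1.32 {d0-d3}, [%[ptr_din]]! 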
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } + act_switch_c2_fp32( + din_hei_ptr, doutc0_ptr, doutc1_ptr, cnt_loop, act_param); } if (we > width) { int offset = i * w_round * c2 + c2 * w4 * cnt; din_hei_ptr = ptr_din + offset; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - din_hei_ptr += 2; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; j < width; ++j) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); + din_hei_ptr += 2; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; j < width; ++j) { + float tmp1 = LITEMAX(din_hei_ptr[0], 0.f); + float tmp2 = LITEMAX(din_hei_ptr[1], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp1, six); + *(doutc1_ptr++) = LITEMIN(tmp2, six); + din_hei_ptr += 2; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+          for (; j < width; ++j) {
+            if (din_hei_ptr[0] >= 0) {
+              *(doutc0_ptr++) = din_hei_ptr[0];
+            } else {
+              *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+            }
+            if (din_hei_ptr[1] >= 0) {
+              *(doutc1_ptr++) = din_hei_ptr[1];
+            } else {
+              *(doutc1_ptr++) = din_hei_ptr[1] * scale;
+            }
+            din_hei_ptr += 2;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
+      }
     } else {
       for (; j < width; ++j) {
@@ -921,7 +1245,309 @@ inline bool write_to_output_c2_fp32(const float* din,
   }
   return true;
 }
-
+// clang-format off
+#ifdef __aarch64__
+#define NCHWC4_TRANS_FP32_COMPUTE \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
+  "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \
+  "movi v20.4s, #0 \n" /* for relu */ \
+  "1: \n" /* main loop*/ \
+  "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \
+  "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
+  "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \
+  "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \
+  "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \
+  "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \
+  "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \
+  "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ \
+  "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/
+
+#define NCHWC4_TRANS_FP32_RELU \
+  "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \
+  "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \
+  "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \
+  "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/
+
+#define NCHWC4_TRANS_FP32_RELU6 \
+  "fmin v16.4s, v16.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v17.4s, v17.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v18.4s, v18.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v19.4s, v19.4s, %[six].4s \n" /* relu6 */
+
+#define NCHWC4_TRANS_FP32_LEAKY_RELU \
+  "fcmge v8.4s, v16.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v9.4s, v17.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v10.4s, v18.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v11.4s, v19.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fmul v4.4s, v16.4s, %[scale].4s \n" /* mul */ \
+  "fmul v5.4s, v17.4s, %[scale].4s \n" /* mul */ \
+  "fmul v6.4s, v18.4s, %[scale].4s \n" /* mul */ \
+  "fmul v7.4s, v19.4s, %[scale].4s \n" /* mul */ \
+  "bif v16.16b, v4.16b, v8.16b \n" /* choose*/ \
+  "bif v17.16b, v5.16b, v9.16b \n" /* choose*/ \
+  "bif v18.16b, v6.16b, v10.16b \n" /* choose*/ \
+  "bif v19.16b, v7.16b, v11.16b \n" /* choose*/
+
+#define NCHWC4_TRANS_FP32_STORE \
+  "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
+  "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \
+  "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \
+  "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \
+  \
+  "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \
+  "bne 1b \n" /* jump to main loop*/
+#else
+#define NCHWC4_TRANS_FP32_COMPUTE \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \
+  "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " \ + "\n" \ + "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " \ + "\n" \ + \ + "vswp d1, d4 @ swap data\n" \ + "vswp d3, d6 @ swap data\n" + +#define NCHWC4_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC4_TRANS_FP32_RELU6 \ + "vmin.f32 q0, q0, %q[six] @ relu6 \n" \ + "vmin.f32 q1, q1, %q[six] @ relu6 \n" \ + "vmin.f32 q2, q2, %q[six] @ relu6 \n" \ + "vmin.f32 q3, q3, %q[six] @ relu6 \n" + +#define NCHWC4_TRANS_FP32_LEAKY_RELU \ + "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \ + "vcge.f32 q6, q1, q15 @ q0 > 0 \n" \ + "vcge.f32 q7, q2, q15 @ q0 > 0 \n" \ + "vcge.f32 q8, q3, q15 @ q0 > 0 \n" \ + "vmul.f32 q9, q0, %q[scale] \n" \ + "vmul.f32 q10, q1, %q[scale] \n" \ + "vmul.f32 q11, q2, %q[scale] \n" \ + "vmul.f32 q12, q3, %q[scale] \n" \ + "vbif q0, q9, q5 @ choose \n" \ + "vbif q1, q10, q6 @ choose \n" \ + "vbif q2, q11, q7 @ choose \n" \ + "vbif q3, q12, q8 @ choose \n" + +#define NCHWC4_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif +// clang-format on +inline void act_switch_c4_fp32(const float* din_ptr, + float* doutc0_ptr, + float* doutc1_ptr, + float* doutc2_ptr, + float* doutc3_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_RELU6 NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [six] "w"(six) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + 
NCHWC4_TRANS_FP32_RELU6 NCHWC4_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr),
+                       [doutc1r0] "+r"(doutc1_ptr),
+                       [doutc2r0] "+r"(doutc2_ptr),
+                       [doutc3r0] "+r"(doutc3_ptr),
+                       [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+                     : [six] "w"(six)
+                     : "q0", "q1", "q2", "q3", "q15");
+#endif
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+/*din = din >= 0 ? din : din * scale*/
+#ifdef __aarch64__
+        asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_LEAKY_RELU
+                         NCHWC4_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr),
+                       [doutc1r0] "+r"(doutc1_ptr),
+                       [doutc2r0] "+r"(doutc2_ptr),
+                       [doutc3r0] "+r"(doutc3_ptr),
+                       [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr)
+                     : [scale] "w"(scale)
+                     : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                       "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+                       "v16", "v17", "v18", "v19", "v20");
+#else
+        asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_LEAKY_RELU
+                         NCHWC4_TRANS_FP32_STORE
+                     : [doutc0r0] "+r"(doutc0_ptr),
+                       [doutc1r0] "+r"(doutc1_ptr),
+                       [doutc2r0] "+r"(doutc2_ptr),
+                       [doutc3r0] "+r"(doutc3_ptr),
+                       [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+                     : [scale] "w"(scale)
+                     : "q0", "q1", "q2", "q3", "q5", "q6", "q7", "q8",
+                       "q9", "q10", "q11", "q12", "q15");
+#endif
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param->active_type)
+                   << " fuse not support";
+    }
+  } else {
+#ifdef __aarch64__
+    asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+                   [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+                   [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr)
+                 :
+                 : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
+                   "v16", "v17", "v18", "v19");
+#else
+    asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE
+                 : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+                   [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+                   [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+                 :
+                 : "q0", "q1", "q2", "q3", "q15");
+#endif
+  }
+}
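+// Editorial sketch, not part of the original patch: the two trn1/trn2
+// levels (or vtrn.32 + vswp) in NCHWC4_TRANS_FP32_COMPUTE form a 4x4
+// transpose; vld4q_f32 is the intrinsic equivalent.
+static inline void c4_transpose_ref(const float* ptr_din,
+                                    float* doutc0, float* doutc1,
+                                    float* doutc2, float* doutc3) {
+  float32x4x4_t v = vld4q_f32(ptr_din);  // de-interleave 4 channels x 4 pixels
+  vst1q_f32(doutc0, v.val[0]);
+  vst1q_f32(doutc1, v.val[1]);
+  vst1q_f32(doutc2, v.val[2]);
+  vst1q_f32(doutc3, v.val[3]);
+}
 /*write result in outputs
 * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w]
 */
@@ -937,11 +1563,13 @@ inline bool write_to_output_c4_fp32(const float* din,
                                     int height,
                                     int width,
                                     bool flag_relu,
-                                    float* trash_ptr) {
+                                    float* trash_ptr,
+                                    operators::ActivationParam* act_param) {
   const int c4 = 4;
   const int w4 = 4;
   const int w_round = we - ws;
   const int ch_n = ce - cs;
+
   if (ch_n != 4) {
     LOG(ERROR) << "write_to_output_c4_fp32 ch_n must be equal 4 and hei_n is "
                   "more than zero";
@@ -958,7 +1586,9 @@ inline bool write_to_output_c4_fp32(const float* din,
 
   int size_h = (he > height ? height : he) - hs;  // size_h == hei_n
 
-  int cnt = (width - ws) / w4;
+  int valid_we = we > width ? 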
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; @@ -981,206 +1611,751 @@ inline bool write_to_output_c4_fp32(const float* din, const float* din_hei_ptr = ptr_din + i * w_round * ch_n; if (cnt > 0) { int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); -#endif - } + act_switch_c4_fp32(din_hei_ptr, + doutc0_ptr, + doutc1_ptr, + doutc2_ptr, + doutc3_ptr, + cnt_loop, + act_param); } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - din_hei_ptr += w4; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; + doutc2_ptr += w4 * cnt; + doutc3_ptr += w4 * cnt; + int j = 0; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; j < remain; ++j) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); + *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); + *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); + din_hei_ptr += 4; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; j < remain; ++j) { + float tmp1 = LITEMAX(din_hei_ptr[0], 0.f); + float tmp2 = LITEMAX(din_hei_ptr[1], 0.f); + float tmp3 = LITEMAX(din_hei_ptr[2], 0.f); + float tmp4 = LITEMAX(din_hei_ptr[3], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp1, six); + *(doutc1_ptr++) = LITEMIN(tmp2, six); + *(doutc2_ptr++) = LITEMIN(tmp3, six); + *(doutc3_ptr++) = LITEMIN(tmp4, six); + din_hei_ptr += 4; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+          for (; j < remain; ++j) {
+            if (din_hei_ptr[0] >= 0) {
+              *(doutc0_ptr++) = din_hei_ptr[0];
+            } else {
+              *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+            }
+            if (din_hei_ptr[1] >= 0) {
+              *(doutc1_ptr++) = din_hei_ptr[1];
+            } else {
+              *(doutc1_ptr++) = din_hei_ptr[1] * scale;
+            }
+            if (din_hei_ptr[2] >= 0) {
+              *(doutc2_ptr++) = din_hei_ptr[2];
+            } else {
+              *(doutc2_ptr++) = din_hei_ptr[2] * scale;
+            }
+            if (din_hei_ptr[3] >= 0) {
+              *(doutc3_ptr++) = din_hei_ptr[3];
+            } else {
+              *(doutc3_ptr++) = din_hei_ptr[3] * scale;
+            }
+            din_hei_ptr += 4;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
+      }
     } else {
-      for (; j < width; ++j) {
+      for (; j < remain; ++j) {
         *(doutc0_ptr++) = din_hei_ptr[0];
         *(doutc1_ptr++) = din_hei_ptr[1];
         *(doutc2_ptr++) = din_hei_ptr[2];
         *(doutc3_ptr++) = din_hei_ptr[3];
-        din_hei_ptr += w4;
+        din_hei_ptr += 4;
       }
     }
   }
 }
 return true;
}
+// clang-format off
+#ifdef __aarch64__
+#define NCHWC8_TRANS_FP32_COMPUTE \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
+  "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \
+  "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q4, q5 */ \
+  "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q6, q7 */ \
+  "movi v20.4s, #0 \n" /* for relu */ \
+  "1: \n" /* main loop*/ \
+  "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q2*/ \
+  "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q2*/ \
+  "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q1, q3*/ \
+  "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q1, q3*/ \
+  "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
+  \
+  "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q4, q6*/ \
+  "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q4, q6*/ \
+  "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q5, q7*/ \
+  "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q5, q7*/ \
+  "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \
+  \
+  "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ \
+  "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ \
+  "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ \
+  "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ \
+  "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q4, q5 */ \
+  \
+  "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ \
+  "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ \
+  "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ \
+  "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ \
+  "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q6, q7 */
+
+#define NCHWC8_TRANS_FP32_RELU \
+  "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \
+  "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \
+  "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \
+  "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ \
+  \
+  "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ \
+  "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ \
+  "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
+  "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/
+
+#define NCHWC8_TRANS_FP32_RELU6 \
+  "fmin v16.4s, v16.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v17.4s, v17.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v18.4s, v18.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v19.4s, v19.4s, %[six].4s \n" /*relu6*/ \
+  \
+  "fmin v8.4s, v8.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v9.4s, v9.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v12.4s, v12.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v13.4s, v13.4s, %[six].4s \n" /*relu6*/
+
+#define NCHWC8_TRANS_FP32_LEAKY_RELU \
+  "fcmge v10.4s, v16.4s, v20.4s \n" /* vcgeq_f32 */ \
+ "fcmge v11.4s, v17.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fcmge v14.4s, v18.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fcmge v15.4s, v19.4s, v20.4s \n" /* vcgeq_u32 */ \ + \ + "fcmge v21.4s, v8.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fcmge v22.4s, v9.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fcmge v23.4s, v12.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fcmge v24.4s, v13.4s, v20.4s \n" /* vcgeq_u32 */ \ + \ + "fmul v25.4s, v16.4s, %[scale].4s \n" /* mul */ \ + "fmul v26.4s, v17.4s, %[scale].4s \n" /* mul */ \ + "fmul v27.4s, v18.4s, %[scale].4s \n" /* mul */ \ + "fmul v28.4s, v19.4s, %[scale].4s \n" /* mul */ \ + \ + "fmul v29.4s, v8.4s, %[scale].4s \n" /* mul */ \ + "fmul v30.4s, v9.4s, %[scale].4s \n" /* mul */ \ + "fmul v31.4s, v12.4s, %[scale].4s \n" /* mul */ \ + \ + "bif v16.16b, v25.16b, v10.16b \n" /* choose*/ \ + "bif v17.16b, v26.16b, v11.16b \n" /* choose*/ \ + "bif v18.16b, v27.16b, v14.16b \n" /* choose*/ \ + "bif v19.16b, v28.16b, v15.16b \n" /* choose*/ \ + "fmul v25.4s, v13.4s, %[scale].4s \n" /* mul */ \ + \ + "bif v8.16b, v29.16b, v21.16b \n" /* choose*/ \ + "bif v9.16b, v30.16b, v22.16b \n" /* choose*/ \ + "bif v12.16b, v31.16b, v23.16b \n" /* choose*/ \ + "bif v13.16b, v25.16b, v24.16b \n" /* choose*/ + +#define NCHWC8_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ \ + "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ \ + "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ \ + "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ + +#else +#define NCHWC8_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \
+  "vmov.u32 q15, #0 @ dump zero\n" \
+  "1: @ main loop\n" \
+  "vtrn.32 q0, q2 @ trans q0, q2 \n" \
+  "vtrn.32 q4, q6 @ trans q4, q6 \n" \
+  "vswp.32 d1, d8 @ swap d1, d8 \n" \
+  "vswp.32 d5, d12 @ swap d5, d12\n" \
+  \
+  "vtrn.32 q1, q3 @ trans q1, q3 \n" \
+  "vtrn.32 q5, q7 @ trans q5, q7 \n" \
+  "vswp.32 d3, d10 @ swap d3, d10\n" \
+  "vswp.32 d7, d14 @ swap d7, d14\n"
+
+#define NCHWC8_TRANS_FP32_RELU \
+  "vmax.f32 q0, q0, q15 @ relu\n" \
+  "vmax.f32 q1, q1, q15 @ relu\n" \
+  "vmax.f32 q2, q2, q15 @ relu\n" \
+  "vmax.f32 q3, q3, q15 @ relu\n" \
+  \
+  "vmax.f32 q4, q4, q15 @ relu\n" \
+  "vmax.f32 q5, q5, q15 @ relu\n" \
+  "vmax.f32 q6, q6, q15 @ relu\n" \
+  "vmax.f32 q7, q7, q15 @ relu\n"
+
+#define NCHWC8_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6\n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6\n" \
+  "vmin.f32 q2, q2, %q[six] @ relu6\n" \
+  "vmin.f32 q3, q3, %q[six] @ relu6\n" \
+  \
+  "vmin.f32 q4, q4, %q[six] @ relu6\n" \
+  "vmin.f32 q5, q5, %q[six] @ relu6\n" \
+  "vmin.f32 q6, q6, %q[six] @ relu6\n" \
+  "vmin.f32 q7, q7, %q[six] @ relu6\n"
+
+#define NCHWC8_TRANS_FP32_LEAKY_RELU \
+  "vmov.u32 q15, #0 @ re-zero: q15 is used as a product temp below\n" \
+  "vcge.f32 q9, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q10, q1, q15 @ q1 > 0 \n" \
+  "vcge.f32 q11, q2, q15 @ q2 > 0 \n" \
+  "vcge.f32 q12, q3, q15 @ q3 > 0 \n" \
+  "vmul.f32 q13, q0, %q[scale] \n" \
+  "vmul.f32 q14, q1, %q[scale] \n" \
+  "vmul.f32 q15, q2, %q[scale] \n" \
+  \
+  "vbif q0, q13, q9 @ choose \n" \
+  "vmul.f32 q9, q3, %q[scale] \n" \
+  \
+  "vbif q1, q14, q10 @ choose \n" \
+  "vbif q2, q15, q11 @ choose \n" \
+  "vbif q3, q9, q12 @ choose \n" \
+  \
+  "vmov.u32 q15, #0 @ re-zero before comparing the second half\n" \
+  "vcge.f32 q9, q4, q15 @ q4 > 0 \n" \
+  "vcge.f32 q10, q5, q15 @ q5 > 0 \n" \
+  "vcge.f32 q11, q6, q15 @ q6 > 0 \n" \
+  "vcge.f32 q12, q7, q15 @ q7 > 0 \n" \
+  "vmul.f32 q13, q4, %q[scale] \n" \
+  "vmul.f32 q14, q5, %q[scale] \n" \
+  "vmul.f32 q15, q6, %q[scale] \n" \
+  \
+  "vbif q4, q13, q9 @ choose \n" \
+  "vmul.f32 q9, q7, %q[scale] \n" \
+  \
+  "vbif q5, q14, q10 @ choose \n" \
+  "vbif q6, q15, q11 @ choose \n" \
+  "vbif q7, q9, q12 @ choose \n"
+
+#define NCHWC8_TRANS_FP32_STORE \
+  "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \
+  "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" \
+  \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \
+  "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \
+  \
+  "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add pointer\n" \
+  \
+  "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \
+  "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \
+ \
+ "bne 1b @ jump to main loop\n"
+
+#endif
+// clang-format on
+inline void act_switch_c8_fp32(const float* din_ptr,
+ float* doutc0_ptr,
+ float* doutc1_ptr,
+ float* doutc2_ptr,
+ float* doutc3_ptr,
+ float* doutc4_ptr,
+ float* doutc5_ptr,
+ float* doutc6_ptr,
+ float* doutc7_ptr,
+ int cnt_loop,
+ const operators::ActivationParam* act_param) {
+ if (act_param != nullptr && act_param->has_active) {
+ float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef);
+ float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha);
+ switch (act_param->active_type) {
+ case lite_api::ActivationType::kRelu:
+#ifdef __aarch64__
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU
+ NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [cnt] "+r"(cnt_loop),
+ [ptr_din] "+r"(din_ptr)
+ :
+ : "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v12",
+ "v13",
+ "v14",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20");
+#else
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU
+ NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [ptr_din] "+r"(din_ptr),
+ [cnt] "+r"(cnt_loop)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15");
+#endif
+ break;
+ case lite_api::ActivationType::kRelu6:
+/* 0 <= din <= 6 */
+#ifdef __aarch64__
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU6
+ NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [cnt] "+r"(cnt_loop),
+ [ptr_din] "+r"(din_ptr)
+ : [six] "w"(six)
+ : "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v12",
+ "v13",
+ "v14",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20");
+#else
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU
+ NCHWC8_TRANS_FP32_RELU6 NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [ptr_din] "+r"(din_ptr),
+ [cnt] "+r"(cnt_loop)
+ : [six] "w"(six)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15");
+#endif
+ break;
+ case lite_api::ActivationType::kLeakyRelu:
+/*din = din >= 0 ? din : din * scale*/
+#ifdef __aarch64__
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_LEAKY_RELU
+ NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [cnt] "+r"(cnt_loop),
+ [ptr_din] "+r"(din_ptr)
+ : [scale] "w"(scale)
+ : "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v12",
+ "v13",
+ "v14",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20",
+ "v21",
+ "v22",
+ "v23",
+ "v24",
+ "v25",
+ "v26",
+ "v27",
+ "v28",
+ "v29",
+ "v30",
+ "v31");
+#else
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_LEAKY_RELU
+ NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [ptr_din] "+r"(din_ptr),
+ [cnt] "+r"(cnt_loop)
+ : [scale] "w"(scale)
+ : "q0",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14",
+ "q15");
+#endif
+ break;
+ default:
+ LOG(FATAL) << "this act_type: "
+ << static_cast<int>(act_param->active_type)
+ << " fuse not supported";
+ }
+ } else {
+#ifdef __aarch64__
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [cnt] "+r"(cnt_loop),
+ [ptr_din] "+r"(din_ptr)
+ :
+ : "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v12",
+ "v13",
+ "v14",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20");
+#else
+ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE
+ : [doutc0r0] "+r"(doutc0_ptr),
+ [doutc1r0] "+r"(doutc1_ptr),
+ [doutc2r0] "+r"(doutc2_ptr),
+ [doutc3r0] "+r"(doutc3_ptr),
+ [doutc4r0] "+r"(doutc4_ptr),
+ [doutc5r0] "+r"(doutc5_ptr),
+ [doutc6r0] "+r"(doutc6_ptr),
+ [doutc7r0] "+r"(doutc7_ptr),
+ [ptr_din] "+r"(din_ptr),
+ [cnt] "+r"(cnt_loop)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15");
+#endif
+ }
+}
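
Review note: for readers less fluent in NEON, the three fused branches above reduce to simple per-element math during the c8-to-planar write-back. A scalar sketch of that math (illustrative only, not part of the patch; `act_ref` is a hypothetical name, and `six`/`alpha` mirror `Relu_clipped_coef`/`Leaky_relu_alpha`):

  // Illustrative scalar equivalent of the fused activations above.
  inline float act_ref(float x, lite_api::ActivationType type, float six, float alpha) {
    switch (type) {
      case lite_api::ActivationType::kRelu:
        return x > 0.f ? x : 0.f;
      case lite_api::ActivationType::kRelu6: {
        float t = x > 0.f ? x : 0.f;  // relu first
        return t < six ? t : six;     // then clamp at six
      }
      case lite_api::ActivationType::kLeakyRelu:
        return x >= 0.f ? x : x * alpha;
      default:
        return x;  // no activation fused
    }
  }
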
+
+#ifdef __aarch64__
+#define LOAD_DATA \
+ "1: \n" \
+ "ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
+ "ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
+ "ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
+ "ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/
+#define DO_RELU \
+ "fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
+ "fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
+ "fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
+ "fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */
+#define DO_RELU6 \
+ "fmin v0.4s, v0.4s, %[vsix].4s \n" /* vminq_f32() */ \
+ "fmin v1.4s, v1.4s, %[vsix].4s \n" /* vminq_f32() */ \
+ "fmin v2.4s, v2.4s, %[vsix].4s \n" /* vminq_f32() */ \
+ "fmin v3.4s, v3.4s, %[vsix].4s \n" /* vminq_f32() */
+#define DO_LEAKY_RELU \
+ "fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
+ "fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
+ "fcmge v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
+ "fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
+ "fcmge v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
+ "fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
+ "fcmge v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
+ "fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
+ "bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
+ "bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
+ "bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \
+ "bif v3.16b, v11.16b, v10.16b \n" /* choose*/
+#define DO_STORE \
+ "subs %w[cnt], %w[cnt], #1 \n" \
+ "st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+ "st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+ "st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+ "st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
+ "bne 1b \n"
+#else
+#define LOAD_DATA \
+ "1: \n" \
+ "vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
+ "vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
+ "vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
+ "vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n"
+#define DO_RELU \
+ "vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
+ "vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
+ "vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
+ "vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
+#define DO_RELU6 \
+ "vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
+ "vmin.f32 q4, q4, %q[vsix] @ vminq_f32() \n" \
+ "vmin.f32 q5, q5, %q[vsix] @ vminq_f32() \n" \
+ "vmin.f32 q6, q6, %q[vsix] @ vminq_f32() \n"
+#define DO_LEAKY_RELU \
+ "vcge.f32 q7, q3, %q[vzero] @ vcgeq_f32 \n" \
+ "vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
+ "vcge.f32 q9, q4, %q[vzero] @ vcgeq_f32 \n" \
+ "vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
+ "vcge.f32 q11, q5, %q[vzero] @ vcgeq_f32 \n" \
+ "vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
+ "vcge.f32 q13, q6, %q[vzero] @ vcgeq_f32 \n" \
+ "vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
+ "vbif q3, q8, q7 @ choose \n" \
+ "vbif q4, q10, q9 @ choose \n" \
+ "vbif q5, q12, q11 @ choose \n" \
+ "vbif q6, q14, q13 @ choose \n"
+#define DO_STORE \
+ "subs %[cnt], #1 \n" \
+ "vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
+ "vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
+ "vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
+ "vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
+ "bne 1b \n"
+#endif
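
Review note: DO_LEAKY_RELU is the assembly form of the standard compare/scale/select idiom. One 4-lane step written with NEON intrinsics would look roughly like the sketch below (`leaky4` is a hypothetical helper, shown only to document the pattern; the macros unroll four such steps so the independent compare/multiply pairs can overlap in the pipeline):

  #include <arm_neon.h>

  // Sketch of one 4-lane leaky-relu step, mirroring fcmge/fmul/bif
  // (vcge/vmul/vbif on armv7).
  static inline float32x4_t leaky4(float32x4_t x, float32x4_t vscale) {
    uint32x4_t ge = vcgeq_f32(x, vdupq_n_f32(0.f));  // lane mask: x >= 0
    float32x4_t xs = vmulq_f32(x, vscale);           // x * alpha
    return vbslq_f32(ge, x, xs);                     // keep x where mask set
  }
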
+/*
+ * Apply the fused activation to the data.
+ * Currently supports the relu, relu6 and leaky_relu activations.
+ */
+inline void act_switch_process(float* src,
+ float* dst,
+ int size,
+ const operators::ActivationParam* act_param) {
+ int cnt = size >> 4;
+ int remain = size % 16;
+ float32x4_t vzero = vdupq_n_f32(0.f);
+ if (act_param != nullptr) {
+ float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
+ float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
+ if (cnt > 0) {
+ switch (act_param->active_type) {
+ case lite_api::ActivationType::kRelu:
+#ifdef __aarch64__
+ asm volatile(
+ LOAD_DATA DO_RELU DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero)
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+ asm volatile(
+ LOAD_DATA DO_RELU DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero)
+ : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+ break;
+ case lite_api::ActivationType::kRelu6:
+#ifdef __aarch64__
+ asm volatile(
+ LOAD_DATA DO_RELU DO_RELU6 DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero), [vsix] "w"(vsix)
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+ asm volatile(
+ LOAD_DATA DO_RELU DO_RELU6 DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero), [vsix] "w"(vsix)
+ : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+ break;
+ case lite_api::ActivationType::kLeakyRelu:
+#ifdef __aarch64__
+ asm volatile(
+ LOAD_DATA DO_LEAKY_RELU DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero), [vscale] "w"(vscale)
+ : "memory",
+ "cc",
+ "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11");
+#else
+ asm volatile(
+ LOAD_DATA DO_LEAKY_RELU DO_STORE
+ : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
+ : [vzero] "w"(vzero), [vscale] "w"(vscale)
+ : "memory",
+ "cc",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14");
+#endif
+ break;
+ default:
+ LOG(FATAL) << "this act_type: "
+ << static_cast<int>(act_param->active_type)
+ << " fuse not supported";
+ }
+ }
+ // remain
+ switch (act_param->active_type) {
+ case lite_api::ActivationType::kRelu:
+ for (int i = 0; i < remain; i++) {
+ *dst = *src >= 0.f ? *src : 0.f;
+ src++;
+ dst++;
+ }
+ break;
+ case lite_api::ActivationType::kRelu6:
+ for (int i = 0; i < remain; i++) {
+ float tmp = *src >= 0.f ? *src : 0.f;
+ *dst = tmp <= act_param->Relu_clipped_coef
+ ? tmp
+ : act_param->Relu_clipped_coef;
+ src++;
+ dst++;
+ }
+ break;
+ case lite_api::ActivationType::kLeakyRelu:
+ for (int i = 0; i < remain; i++) {
+ if (*src >= 0.f) {
+ *dst = *src;
+ } else {
+ *dst = *src * act_param->Leaky_relu_alpha;
+ }
+ src++;
+ dst++;
+ }
+ break;
+ default:
+ LOG(FATAL) << "this act_type: "
+ << static_cast<int>(act_param->active_type)
+ << " fuse not supported";
+ }
+ }
+}
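
Review note: each asm iteration of act_switch_process handles 16 floats (four 4-lane registers), hence cnt = size >> 4 with a scalar tail of size % 16 elements. A minimal call-site sketch, assuming `src_buf`/`dst_buf` are hypothetical buffers of at least `size` floats:

  // Apply a fused relu6 over a contiguous buffer of 100 floats:
  // 6 vector iterations (96 elements) plus a 4-element scalar tail.
  operators::ActivationParam act;
  act.has_active = true;
  act.active_type = lite_api::ActivationType::kRelu6;
  act.Relu_clipped_coef = 6.f;
  act_switch_process(src_buf, dst_buf, 100, &act);
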
 /*write result in outputs
 * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w]
@@ -1199,7 +2374,8 @@ inline bool write_to_output_c8_fp32(const float* din,
 int height,
 int width,
 bool flag_relu,
- float* trash_ptr) {
+ float* trash_ptr,
+ operators::ActivationParam* act_param) {
 if (ch_n != 8 || hei_n <= 0) {
 LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero";
 return false;
 }
@@ -1220,392 +2396,161 @@ inline bool write_to_output_c8_fp32(const float* din,
 int size_h = (he > height ?
height : he) - hs; // size_h == hei_n int valid_w = we - ws; + int w4 = 4; int cnt = valid_w / 4; if (we > width) { cnt--; } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - 
[doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0.f); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f); - din_hei_ptr += 8; - } + for (int i = 0; i < size_h; i++) { + int size_w = i * width; + float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; + float* doutc1_ptr = doutc1r0 + size_w; + float* doutc2_ptr = doutc2r0 + size_w; + float* doutc3_ptr = doutc3r0 + size_w; + float* doutc4_ptr = doutc4r0 + size_w; + float* doutc5_ptr = doutc5r0 + size_w; + float* doutc6_ptr = doutc6r0 + size_w; + float* doutc7_ptr = doutc7r0 + size_w; + if (ce > channel) { + switch (ce - channel) { + case 7: + doutc1_ptr = trash_ptr; + case 6: + doutc2_ptr = trash_ptr; + case 5: + doutc3_ptr = trash_ptr; + case 4: + doutc4_ptr = trash_ptr; + case 3: + doutc5_ptr = trash_ptr; + case 2: + doutc6_ptr = trash_ptr; + case 1: + doutc7_ptr = trash_ptr; + default: + break; } } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: + ptr_din = din + i * valid_w * ch_n; + const float* din_hei_ptr = ptr_din; + if (cnt > 0) { + int cnt_loop = cnt; + act_switch_c8_fp32(din_hei_ptr, + doutc0_ptr, + doutc1_ptr, + doutc2_ptr, + doutc3_ptr, + doutc4_ptr, + doutc5_ptr, + doutc6_ptr, + doutc7_ptr, + cnt_loop, + act_param); + } + if (we > width) { + int offset = 32 * (valid_w / 4 - 1); + din_hei_ptr = ptr_din + offset; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; + doutc2_ptr += w4 * cnt; + doutc3_ptr += w4 * cnt; + doutc4_ptr += w4 * cnt; + doutc5_ptr += w4 * cnt; + doutc6_ptr += w4 * cnt; + doutc7_ptr += w4 * cnt; + int i = we - 4; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; i < width; ++i) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); + *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); + *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); + *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0.f); + 
*(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f);
+ *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f);
+ *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f);
+ din_hei_ptr += 8;
+ }
+ break;
+ case lite_api::ActivationType::kRelu6:
+ /* 0 <= din <= 6 */
+ for (; i < width; ++i) {
+ float tmp1 = LITEMAX(din_hei_ptr[0], 0.f);
+ float tmp2 = LITEMAX(din_hei_ptr[1], 0.f);
+ float tmp3 = LITEMAX(din_hei_ptr[2], 0.f);
+ float tmp4 = LITEMAX(din_hei_ptr[3], 0.f);
+ float tmp5 = LITEMAX(din_hei_ptr[4], 0.f);
+ float tmp6 = LITEMAX(din_hei_ptr[5], 0.f);
+ float tmp7 = LITEMAX(din_hei_ptr[6], 0.f);
+ float tmp8 = LITEMAX(din_hei_ptr[7], 0.f);
+ *(doutc0_ptr++) = LITEMIN(tmp1, six);
+ *(doutc1_ptr++) = LITEMIN(tmp2, six);
+ *(doutc2_ptr++) = LITEMIN(tmp3, six);
+ *(doutc3_ptr++) = LITEMIN(tmp4, six);
+ *(doutc4_ptr++) = LITEMIN(tmp5, six);
+ *(doutc5_ptr++) = LITEMIN(tmp6, six);
+ *(doutc6_ptr++) = LITEMIN(tmp7, six);
+ *(doutc7_ptr++) = LITEMIN(tmp8, six);
+ din_hei_ptr += 8;
+ }
+ break;
+ case lite_api::ActivationType::kLeakyRelu:
+ /*din = din >= 0 ? din : din * scale*/
+ for (; i < width; ++i) {
+ if (din_hei_ptr[0] >= 0) {
+ *(doutc0_ptr++) = din_hei_ptr[0];
+ } else {
+ *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+ }
+ if (din_hei_ptr[1] >= 0) {
+ *(doutc1_ptr++) = din_hei_ptr[1];
+ } else {
+ *(doutc1_ptr++) = din_hei_ptr[1] * scale;
+ }
+ if (din_hei_ptr[2] >= 0) {
+ *(doutc2_ptr++) = din_hei_ptr[2];
+ } else {
+ *(doutc2_ptr++) = din_hei_ptr[2] * scale;
+ }
+ if (din_hei_ptr[3] >= 0) {
+ *(doutc3_ptr++) = din_hei_ptr[3];
+ } else {
+ *(doutc3_ptr++) = din_hei_ptr[3] * scale;
+ }
+ if (din_hei_ptr[4] >= 0) {
+ *(doutc4_ptr++) = din_hei_ptr[4];
+ } else {
+ *(doutc4_ptr++) = din_hei_ptr[4] * scale;
+ }
+ if (din_hei_ptr[5] >= 0) {
+ *(doutc5_ptr++) = din_hei_ptr[5];
+ } else {
+ *(doutc5_ptr++) = din_hei_ptr[5] * scale;
+ }
+ if (din_hei_ptr[6] >= 0) {
+ *(doutc6_ptr++) = din_hei_ptr[6];
+ } else {
+ *(doutc6_ptr++) = din_hei_ptr[6] * scale;
+ }
+ if (din_hei_ptr[7] >= 0) {
+ *(doutc7_ptr++) = din_hei_ptr[7];
+ } else {
+ *(doutc7_ptr++) = din_hei_ptr[7] * scale;
+ }
+ din_hei_ptr += 8;
+ }
+ break;
+ default:
+ LOG(FATAL) << "this act_type: "
+ << static_cast<int>(act_param->active_type)
+ << " fuse not supported";
+ }
- }
- ptr_din = din + i * valid_w * ch_n;
- const float* din_hei_ptr = ptr_din;
- if (cnt > 0) {
- int cnt_loop = cnt;
-#ifdef __aarch64__
- asm volatile(
- "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */
- "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */
- "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */
- "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */
- "1: \n" /* main loop*/
- "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/
- "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/
- "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/
- "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/
- "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */
-
- "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/
- "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/
- "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/
- "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/
- "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */
-
- "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/
- "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/
- "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/
- "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/
- "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */
-
- "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; + } else { for (; i < width; ++i) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 1a23982cd575afb6b249390de7081165c03414b9..4c5f284a19f615382ea04904184427f569f95ff3 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -37,6 +37,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, const float* weights, const float* bias, const operators::ConvParam& param, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_3x3s2_depthwise_fp32(const float* i_data, @@ -51,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, const float* weights, const float* bias, const operators::ConvParam& param, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s1_fp32(const float* din, @@ -66,7 +68,7 @@ void conv_depthwise_3x3s1_fp32(const float* din, const float* bias, int pad, bool flag_bias, - bool flag_relu, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2_fp32(const float* din, @@ -82,39 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, const float* bias, int pad, bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, + const operators::ActivationParam act_param, ARMContext* ctx); template @@ -153,20 +123,21 @@ void conv_depthwise_3x3s2_int8(Dtype* dout, int padh, ARMContext* ctx); -void conv_depthwise_5x5s1_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, +void conv_depthwise_5x5s1_fp32(float* dout, + const float* din, const float* weights, const float* bias, - int pad, bool flag_bias, bool flag_relu, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int padw, + int padh, + const operators::ConvParam& param, ARMContext* ctx); void conv_depthwise_5x5s2_fp32(const float* din, @@ -180,13 +151,46 @@ void conv_depthwise_5x5s2_fp32(const float* din, int win, const float* weights, const float* bias, - int pad, + const operators::ConvParam& param, + const operators::ActivationParam act_param, + ARMContext* ctx); + +void conv_depthwise_5x5s2p2_fp32(const float* din, + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext* ctx); + +template +void conv_depthwise_5x5s1_int8(Dtype* dout, + 
const int8_t* din, + const int8_t* weights, + const float* scale, + const float* bias, bool flag_bias, bool flag_relu, + int num, + int chin, + int hin, + int win, + int hout, + int wout, + int padw, + int padh, ARMContext* ctx); template -void conv_depthwise_5x5s1_int8(Dtype* dout, +void conv_depthwise_5x5s2_int8(Dtype* dout, const int8_t* din, const int8_t* weights, const float* scale, diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index 0c050ffe6fb0f064f5c26ea0da6acee17f4403ae..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla 
v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - 
"bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 
+= din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! 
@ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows
-    }
-#endif
-  }
-}
-
-/**
- * \brief depthwise convolution kernel 3x3, stride 2
- */
-// w_in > 7
-void conv_depthwise_3x3s2p0_bias(float* dout,
-                                 const float* din,
-                                 const float* weights,
-                                 const float* bias,
-                                 bool flag_bias,
-                                 const int num,
-                                 const int ch_in,
-                                 const int h_in,
-                                 const int w_in,
-                                 const int h_out,
-                                 const int w_out,
-                                 ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-
-  int tile_w = w_out >> 2;
-  int cnt_remain = w_out % 4;
-
-  unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3));
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask =
-      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float32x4_t vzero = vdupq_n_f32(0.f);
-
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-
-#ifdef __aarch64__
-      for (int i = 0; i < h_out; i += 2) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-
-        dr0 = dr4;
-        dr1 = dr0 + w_in;
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        dr4 = dr3 + w_in;
-
-        //! process bottom pad
-        if (i + 4 >= h_in) {
-          switch (i + 4 - h_in) {
-            case 4:
-              din1_ptr = zero_ptr;
-            case 3:
-              din2_ptr = zero_ptr;
-            case 2:
-              din3_ptr = zero_ptr;
-            case 1:
-              din4_ptr = zero_ptr;
-            case 0:
-              din4_ptr = zero_ptr;
-            default:
-              break;
-          }
-        }
-        //! process output pad
-        if (i + 2 > h_out) {
-          doutr1_ptr = write_ptr;
-        }
-        int cnt = tile_w;
-        asm volatile(
-            // top
-            // Load up 12 elements (3 vectors) from each of 8 sources.
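For reference before the scheduling-heavy loop body below: the `ld2` instructions de-interleave eight consecutive input floats into an even-column vector {0,2,4,6} and an odd-column vector {1,3,5,7}, and the trailing `ld1`/`ext` pair builds the shifted even vector {2,4,6,8}. Each filter row therefore contributes exactly one multiply-accumulate per tap to four stride-2 outputs. A minimal intrinsics sketch of the same access pattern (the helper name, the scalar-weight signature, and the requirement that `din[0..8]` be readable are our assumptions, not part of this patch):

```cpp
#include <arm_neon.h>

// Sketch (names ours): one input row's contribution to four stride-2
// outputs, mirroring the deleted ld2/ld1/ext/fmla pattern. Assumes
// din[0..8] are readable; the kernel's bif masking covers the right
// edge where that does not hold.
static inline float32x4_t row3_s2(const float* din,
                                  float w0, float w1, float w2,
                                  float32x4_t acc) {
  float32x4x2_t v = vld2q_f32(din);      // val[0]={x0,x2,x4,x6}, val[1]={x1,x3,x5,x7}
  float32x4_t nxt = vld1q_f32(din + 8);  // {x8,...}; only lane 0 is consumed
  float32x4_t sh = vextq_f32(v.val[0], nxt, 1);  // {x2,x4,x6,x8}
  acc = vmlaq_n_f32(acc, v.val[0], w0);  // even columns  * w[0]
  acc = vmlaq_n_f32(acc, v.val[1], w1);  // odd columns   * w[1]
  acc = vmlaq_n_f32(acc, sh, w2);        // shifted evens * w[2]
  return acc;
}
```

A full output row is then the sum of three such row contributions (five input rows feed the two output rows computed per iteration here), accumulated into a register pre-filled with the bias, which is what the `and v16.16b, %[vbias].16b, %[vbias].16b` initialisation arranges.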
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // 
pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! 
process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 
\n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , 
v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows
-    }
-#endif
-  }
-}
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, with relu
- */
-// w_in > 7
-void conv_depthwise_3x3s2p0_bias_relu(float* dout,
-                                      const float* din,
-                                      const float* weights,
-                                      const float* bias,
-                                      bool flag_bias,
-                                      const int num,
-                                      const int ch_in,
-                                      const int h_in,
-                                      const int w_in,
-                                      const int h_out,
-                                      const int w_out,
-                                      ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-
-  int tile_w = w_out >> 2;
-  int cnt_remain = w_out % 4;
-
-  unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3));
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask =
-      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float32x4_t vzero = vdupq_n_f32(0.f);
-
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-
-#ifdef __aarch64__
-      for (int i = 0; i < h_out; i += 2) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-
-        dr0 = dr4;
-        dr1 = dr0 + w_in;
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        dr4 = dr3 + w_in;
-
-        //! process bottom pad
-        if (i + 4 >= h_in) {
-          switch (i + 4 - h_in) {
-            case 4:
-              din1_ptr = zero_ptr;
-            case 3:
-              din2_ptr = zero_ptr;
-            case 2:
-              din3_ptr = zero_ptr;
-            case 1:
-              din4_ptr = zero_ptr;
-            case 0:
-              din4_ptr = zero_ptr;
-            default:
-              break;
-          }
-        }
-        //! process output pad
-        if (i + 2 > h_out) {
-          doutr1_ptr = write_ptr;
-        }
-        int cnt = tile_w;
-        asm volatile(
-            // top
-            // Load up 12 elements (3 vectors) from each of 8 sources.
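This `_relu` variant repeats the stride-2 main body of `conv_depthwise_3x3s2p0_bias` above almost line for line; the functional difference is a fused activation: each accumulated output vector passes through `fmax vN.4s, vN.4s, %[vzero].4s` before its `st1`, so ReLU costs one instruction instead of a second pass over the output tensor. In the right-pad tail the clamped result is additionally blended with the previously stored values under `%[wmask]`, so only the valid remainder lanes are overwritten. A sketch of that epilogue in intrinsics (helper and variable names are ours; the intended register mapping is noted in the comments):

```cpp
#include <arm_neon.h>

// Sketch of the deleted right-edge store path: partial sums are folded
// into the bias-initialised accumulator, clamped at zero (ReLU), then
// blended with the previous output under the remainder mask.
static inline void store_relu_masked(float* out,
                                     float32x4_t acc,        // v16, starts as bias
                                     float32x4_t partial_a,  // v11
                                     float32x4_t partial_b,  // v12
                                     uint32x4_t wmask) {     // %[wmask]
  acc = vaddq_f32(acc, partial_a);          // fadd v16.4s, v16.4s, v11.4s
  acc = vaddq_f32(acc, partial_b);          // fadd v16.4s, v16.4s, v12.4s
  acc = vmaxq_f32(acc, vdupq_n_f32(0.f));   // fmax v16.4s, v16.4s, vzero
  float32x4_t prev = vld1q_f32(out);        // ld1 {v0.4s}, [%[outptr0]]
  acc = vbslq_f32(wmask, acc, prev);        // bif v16.16b, v0.16b, wmask
  vst1q_f32(out, acc);                      // st1 {v16.4s}, [%[outptr0]]
}
```

In the main loop the same sequence runs without the load and blend, which is why only the `1:`/`3:` right-pad path below reloads `%[outptr0]` and `%[outptr1]`.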
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext 
v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, 
#8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 
2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // 
d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int 
ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
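[reviewer note, not part of the diff] The small-width stride-2 kernels above (and the armv7 path that follows) all lean on the same load pattern: `ld2`/`vld2.32` deinterleaves the input row into even columns {0,2,4,6} and odd columns {1,3,5,7}, so each 3-tap filter row becomes three vector multiply-accumulates over {0,2,4,6}, {1,3,5,7} and {2,4,6,0}. The last vector is produced by `ext` against a zero register, which is safe here because the right-pad mask has already zeroed the out-of-range lanes. A minimal intrinsics sketch of one such row (the helper name row3_stride2 is illustrative, not from the tree):

    #include <arm_neon.h>

    // out[j] = in[2j]*w0 + in[2j+1]*w1 + in[2j+2]*w2 for j = 0..3,
    // assuming lanes at or beyond w_in were already masked to zero.
    static inline float32x4_t row3_stride2(float32x4_t even,  // {in0,in2,in4,in6}
                                           float32x4_t odd,   // {in1,in3,in5,in7}
                                           float w0, float w1, float w2) {
      // Shift the even lanes left by one float against zero: {in2,in4,in6,0}.
      float32x4_t even_next = vextq_f32(even, vdupq_n_f32(0.f), 1);
      float32x4_t acc = vmulq_n_f32(even, w0);
      acc = vmlaq_n_f32(acc, odd, w1);
      return vmlaq_n_f32(acc, even_next, w2);
    }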
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index 6f28d48d6d2bdd60e0c33f9b4b753835337fc8a4..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
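[reviewer note, not part of the diff] Both the width<=4 p0 kernels above and the p1 kernels in the file deleted below share the same right-edge handling: a per-lane integer compare between a column/pad index vector and the width builds a bit mask, and `bif`/`vbif` (bitwise insert-if-false) keeps in-range lanes while zeroing the rest, so the inner loops never branch on width. A minimal sketch with intrinsics, using the stride-2 form of the compare (mask_lanes is an illustrative name, not from the tree):

    #include <arm_neon.h>

    // Zero the lanes whose source column falls at or beyond w_in.
    // idx holds the input column each lane reads, e.g. {0,2,4,6} for
    // the even half of a stride-2 load.
    static inline float32x4_t mask_lanes(float32x4_t v,
                                         const int32_t idx[4],
                                         int32_t w_in) {
      uint32x4_t keep = vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(idx));  // idx < w_in
      return vbslq_f32(keep, v, vdupq_n_f32(0.f));  // keep in-range lanes, else 0
    }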
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla 
v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources.
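// The stride-2 loop below works on de-interleaved columns: "ld2" splits each
// input row into even lanes {0,2,4,6} and odd lanes {1,3,5,7}, and "ext"
// against vzero prepends the implicit left zero pad to form {0,1,3,5}, so
// that output column j consumes input columns 2j-1, 2j and 2j+1. A minimal
// NEON-intrinsics sketch of the row-0 accumulation (variable names are
// illustrative, not from this file):
//   float32x4x2_t in = vld2q_f32(din0_ptr);            // {0,2,4,6}, {1,3,5,7}
//   float32x4_t pad = vextq_f32(vzero, in.val[1], 3);  // {0,1,3,5}
//   float32x4_t acc = vmlaq_laneq_f32(wbias, in.val[0], wr0, 1);
//   acc = vmlaq_laneq_f32(acc, in.val[1], wr0, 2);
//   acc = vmlaq_laneq_f32(acc, pad, wr0, 0);           // 4 outputs of row 0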
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" 
// {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, 
v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //!
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* 
outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - 
w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" 
/* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast<float*>(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]!
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with relu - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 +
w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - 
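// Note how this relu variant re-seeds the accumulators from the bias with
// "and vN.16b, %[vbias].16b, %[vbias].16b" (a plain register copy) and fuses
// the activation as "fmax vN.4s, vN.4s, vzero.4s" right before each store.
// A hedged scalar sketch of the per-lane epilogue (sum_r0..sum_r2 stand for
// the partial sums over input rows 0..2 and are not names from this file):
//   float acc = bias_c + sum_r0 + sum_r1 + sum_r2;  // rows 0..2 -> out row 0
//   acc = (acc > 0.f) ? acc : 0.f;                  // relu fused before store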
"and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, 
[%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - 
vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w21 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - 
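// Illustrative note: the fadd/fmax sequence here is the bias + relu
// epilogue used throughout these kernels. As intrinsics (vsum, vbias,
// vzero and out are hypothetical names):
//   float32x4_t vout = vmaxq_f32(vaddq_f32(vsum, vbias), vzero);
//   vst1q_f32(out, vout);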
"fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - 
uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w21 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - "fmax v4.4s, v4.4s, v9.4s \n" - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu\n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s1.cc b/lite/backends/arm/math/conv_depthwise_3x3s1.cc deleted file mode 100644 index 8d0ebb58ad1b7e325bae3649b13914641021038f..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s1.cc +++ /dev/null @@ -1,2539 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1_fp32(const float *din, - float *dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float *weights, - const float *bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext *ctx) { - if (pad == 0) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} - -#ifdef __aarch64__ -#define INIT_S1 \ - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ - "movi v21.4s, #0x0\n" /* out0 = 0 */ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - -#define LEFT_COMPUTE_S1 \ - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ - \ - "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ - \ - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ - \ - /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ - 
"ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - -#define LEFT_RESULT_S1 \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "blt 3f \n" - -#define MID_COMPUTE_S1 \ - "1: \n" /* r0 */ \ - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define MID_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_COMPUTE_S1 \ - "3: \n" \ - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ - "ld1 {v22.4s}, [%[doutr0]] \n" \ - "ld1 {v23.4s}, [%[doutr1]] \n" \ - "ld1 {v24.4s}, [%[doutr2]] \n" \ - "ld1 {v25.4s}, [%[doutr3]] \n" \ - \ - "bif v0.16b, %[vzero].16b, v18.16b \n" \ - "bif v1.16b, %[vzero].16b, v19.16b \n" \ - "bif v2.16b, %[vzero].16b, v18.16b \n" \ - "bif v3.16b, %[vzero].16b, v19.16b \n" \ - \ - "bif v4.16b, %[vzero].16b, v18.16b \n" \ - "bif 
v5.16b, %[vzero].16b, v19.16b \n" \ - "bif v6.16b, %[vzero].16b, v18.16b \n" \ - "bif v7.16b, %[vzero].16b, v19.16b \n" \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v8.16b, %[vzero].16b, v18.16b \n" \ - "bif v9.16b, %[vzero].16b, v19.16b \n" \ - "bif v10.16b, %[vzero].16b, v18.16b \n" \ - "bif v11.16b, %[vzero].16b, v19.16b \n" \ - \ - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v18.4s}, [%[rmask]] \n" \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 
+= din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define LEFT_RESULT_S1_RELU \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "blt 3f \n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ 
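// Illustrative note on the bif/v18 pairs in this right-edge macro: they
// blend the freshly computed lanes with the previously stored output so a
// full 4-wide store stays safe past the end of the row. As intrinsics
// (rmask, vnew, doutr are hypothetical names):
//   float32x4_t vold = vld1q_f32(doutr);              // existing output
//   float32x4_t vres = vbslq_f32(rmask, vnew, vold);  // keep masked lanes
//   vst1q_f32(doutr, vres);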
\ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define COMPUTE_S_S1 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s}, [%[din0]], #16\n" \ - "ld1 {v1.4s}, [%[din1]], #16\n" \ - "ld1 {v2.4s}, [%[din2]], #16\n" \ - "ld1 {v3.4s}, [%[din3]], #16\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask].16b\n" \ - "bif v2.16b, %[zero].16b, %[mask].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask].16b\n" \ - \ - "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ - "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ - "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ - "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ - \ - "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ - "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ - "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ - "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ - \ - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ - \ - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ - \ - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ - \ - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ - \ - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ - \ - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ - \ - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ - \ - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v14.4s\n" \ - "fadd v12.4s, v12.4s, v16.4s\n" \ - \ - "fadd v13.4s, v13.4s, v15.4s\n" \ - "fadd v13.4s, v13.4s, v17.4s\n" \ - \ - "fadd v12.4s, v12.4s, %[bias].4s\n" \ - "fadd v13.4s, v13.4s, %[bias].4s\n" - -#define RESULT_S_S1 \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "fmax v12.4s, v12.4s, %[zero].4s\n" \ - "fmax v13.4s, v13.4s, %[zero].4s\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ - "bif 
v7.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "ext v8.16b, v0.16b, v1.16b, #4\n" \ - "ext v9.16b, v0.16b, v1.16b, #8\n" \ - \ - "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "ext v8.16b, v2.16b, v3.16b, #4\n" \ - "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ - \ - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ - \ - "ext v8.16b, v4.16b, v5.16b, #4\n" \ - "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "ext v8.16b, v6.16b, v7.16b, #4\n" \ - "ext v9.16b, v6.16b, v7.16b, #8\n" \ - \ - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fadd v12.4s, v12.4s, v10.4s\n" \ - \ - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v11.4s\n" \ - "fadd v13.4s, v13.4s, v14.4s\n" \ - "fadd v13.4s, v13.4s, v15.4s\n" // \ - // "prfm pldl1keep, [%[out1]]\n" \ - // "prfm pldl1keep, [%[out2]]\n" \ - // \ - // "st1 {v12.4s}, [%[out1]]\n" \ - // "st1 {v13.4s}, [%[out2]]\n" \ - - -#else -#define INIT_S1 \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" - -#define LEFT_COMPUTE_S1 \ - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ - "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ - "vext.32 q7, q10, q11, #1 @ 1234\n" \ - \ - /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ - "vext.32 q7, q12, q13, #1 @ 1234\n" \ - \ - /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ - "vext.32 q7, q14, q15, #1 @ 1234\n" - -#define LEFT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_COMPUTE_S1 \ - "1: @ right pad entry\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define MID_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_COMPUTE_S1 \ - "3: @ right pad entry\n" \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define LEFT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define COMPUTE_S_S1 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - \ - "vld1.32 {d12-d13}, [%[din0]]!\n" \ - "vld1.32 {d14-d15}, [%[din1]]!\n" \ - "vld1.32 {d16-d17}, [%[din2]]!\n" \ - "vld1.32 {d18-d19}, [%[din3]]!\n" \ - \ - "vbif q6, %q[vzero], %q[mask]\n" \ - "vbif q7, %q[vzero], %q[mask]\n" \ - "vbif q8, %q[vzero], %q[mask]\n" \ - "vbif q9, %q[vzero], %q[mask]\n" \ - \ - "vmul.f32 q14, q6, %e[wr0][1]\n" \ - "vmul.f32 q15, q7, %e[wr0][1]\n" \ - \ - "vmla.f32 q14, q7, %e[wr1][1]\n" \ - "vmla.f32 q15, q8, %e[wr1][1]\n" \ - \ - "vmla.f32 q14, q8, %e[wr2][1]\n" \ - "vmla.f32 q15, q9, %e[wr2][1]\n" \ - \ - "vext.32 q10, %q[vzero], q6, #3\n" \ - "vext.32 q11, %q[vzero], q7, #3\n" \ - "vext.32 q12, %q[vzero], q8, #3\n" \ - "vext.32 q13, %q[vzero], q9, #3\n" \ - \ - "vmla.f32 q14, q10, %e[wr0][0]\n" \ - "vmla.f32 q15, q11, %e[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %e[wr1][0]\n" \ - "vmla.f32 q15, q12, %e[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %e[wr2][0]\n" \ - "vmla.f32 q15, q13, %e[wr2][0]\n" \ - \ - "vext.32 q10, q6, %q[vzero], #1\n" \ - "vext.32 q11, q7, %q[vzero], #1\n" \ - "vext.32 q12, q8, %q[vzero], #1\n" \ - "vext.32 q13, q9, %q[vzero], #1\n" \ - \ - "vmla.f32 q14, q10, %f[wr0][0]\n" \ - "vmla.f32 q15, q11, %f[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %f[wr1][0]\n" \ - "vmla.f32 q15, q12, %f[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %f[wr2][0]\n" \ - "vmla.f32 q15, q13, %f[wr2][0]\n" \ - \ - "vadd.f32 q14, q14, %q[bias]\n" \ - "vadd.f32 q15, q15, %q[bias]\n" - -#define RESULT_S_S1 \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmax.f32 q14, q14, %q[vzero]\n" \ - "vmax.f32 q15, q15, %q[vzero]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ - \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ - \ - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ - "vadd.f32 q15, q5, q9 @ q4 += q10 \n" - -#endif -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
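/* Reference sketch (illustrative): a naive scalar version of what
 * conv_depthwise_3x3s1p1_bias computes for one channel -- stride 1, implicit
 * zero pad of 1, optional bias and relu -- usable as a correctness oracle
 * for the NEON paths below. The helper and its argument layout (row-major
 * single channel) are hypothetical. */
static void dw3x3s1p1_ref(float* out, const float* in, const float* w,
                          float bias, bool relu, int h, int wid) {
  for (int oy = 0; oy < h; ++oy) {        // pad = 1 keeps h_out == h_in
    for (int ox = 0; ox < wid; ++ox) {    // and w_out == w_in
      float acc = bias;
      for (int ky = 0; ky < 3; ++ky) {
        for (int kx = 0; kx < 3; ++kx) {
          int iy = oy + ky - 1, ix = ox + kx - 1;  // implicit zero padding
          if (iy >= 0 && iy < h && ix >= 0 && ix < wid)
            acc += in[iy * wid + ix] * w[ky * 3 + kx];
        }
      }
      out[oy * wid + ox] = relu ? (acc > 0.f ? acc : 0.f) : acc;
    }
  }
}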
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - float *ptr_zero = const_cast(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
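/* Reference sketch (illustrative): how the right-pad masks consumed by the
 * "bif"/"vbif" instructions above are built. right_pad_idx holds descending
 * lane indices, so lanes with index >= size_pad_right are real pixels and
 * survive; lanes hanging past the row end are replaced by the zero padding
 * value. Helper name is hypothetical. */
#include <arm_neon.h>

static inline float32x4_t mask_right_pad(float32x4_t row,
                                         unsigned int size_pad_right) {
  const unsigned int idx[4] = {5, 4, 3, 2};  // first half of right_pad_idx
  uint32x4_t keep = vcgeq_u32(vld1q_u32(idx), vdupq_n_u32(size_pad_right));
  // "bif row, zero, keep"  ==  keep ? row : 0.f
  return vbslq_f32(keep, row, vdupq_n_f32(0.f));
}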
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
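/* Reference sketch (illustrative): the branch-free boundary handling used by
 * both switch statements above. Input rows past h_in are redirected to a
 * zeroed scratch row (zero_ptr) and output rows past h_out to a throw-away
 * row (write_ptr), so the fixed multi-row asm body always runs; the
 * intentional switch fall-through redirects every row from the first
 * out-of-range one onward. */
static void redirect_out_of_range_rows(const float* din[6], float* dout[4],
                                       int i, int h_in, int h_out,
                                       const float* zero_row,
                                       float* trash_row) {
  if (i + 5 > h_in) {
    switch (i + 5 - h_in) {         // deliberate fall-through below
      case 5: din[1] = zero_row;    // FALLTHROUGH
      case 4: din[2] = zero_row;    // FALLTHROUGH
      case 3: din[3] = zero_row;    // FALLTHROUGH
      case 2: din[4] = zero_row;    // FALLTHROUGH
      case 1: din[5] = zero_row;
      default: break;
    }
  }
  if (i + 4 > h_out) {
    switch (i + 4 - h_out) {
      case 3: dout[1] = trash_row;  // FALLTHROUGH
      case 2: dout[2] = trash_row;  // FALLTHROUGH
      case 1: dout[3] = trash_row;
      default: break;
    }
  }
}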
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
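/* Reference sketch (illustrative, aarch64): the operand-binding pattern
 * shared by every asm volatile block in this file. "+r" marks pointers the
 * assembly both reads and post-increments, "w" pins vectors into SIMD
 * registers, and the clobber list names each register the macros overwrite
 * so the compiler will not keep live values there. Hypothetical helper. */
static inline void add_one_inplace(float* p) {
  asm volatile(
      "ld1  {v0.4s}, [%[ptr]]       \n" /* load 4 floats              */
      "fmov v1.4s, #1.0             \n" /* splat constant 1.0         */
      "fadd v0.4s, v0.4s, v1.4s     \n" /* elementwise add            */
      "st1  {v0.4s}, [%[ptr]], #16  \n" /* store and advance pointer  */
      : [ptr] "+r"(p)
      :
      : "cc", "memory", "v0", "v1");
}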
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float *dr0 = din_channel + hs * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing 
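/* Worked example (hypothetical width) for the w_in <= 4 mask above: with
 * w_in == 3, 4 - w_in == 1, and right_pad_idx = {3,2,1,0}, vcgeq_s32 yields
 * lanes {1,1,1,0}; the three real pixels survive and lane 3 reads as zero
 * padding. The kernel then stores through out_buf1/out_buf2 so only w_out
 * values ever reach the real output row. */
#include <arm_neon.h>
#include <cassert>

static void check_small_width_mask() {
  const int right_pad_idx[4] = {3, 2, 1, 0};
  const int w_in = 3;  // example width
  uint32x4_t m = vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in));
  unsigned int lanes[4];
  vst1q_u32(lanes, m);
  assert(lanes[0] && lanes[1] && lanes[2] && !lanes[3]);
}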
batchs -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - - float *ptr_zero = const_cast(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 4: - din_ptr1 = zero_ptr; - case 3: - din_ptr2 = zero_ptr; - case 2: - din_ptr3 = zero_ptr; - case 1: - din_ptr4 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
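/* Reference sketch (illustrative) of the pad == 0 tiling arithmetic above,
 * with one worked example on hypothetical sizes. */
static inline void p0_tiling(int w_in, int w_out, int* tile_w, int* remain,
                             unsigned int* size_pad_right) {
  *tile_w = w_out >> 2;  // four output columns per main-loop iteration
  *remain = w_out % 4;   // trailing columns handled by the masked tail
  // the last 6-wide input window overruns the row by this many floats
  *size_pad_right = (unsigned int)(6 + (*tile_w << 2) - w_in);
}
// e.g. w_in = 10 gives w_out = 8 for 3x3 s1 p0, so tile_w = 2, remain = 0,
// and size_pad_right = 6 + 8 - 10 = 4 masked lanes in the final loads.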
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_out; i += 2) { - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - case 0: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#endif // __aarch64__ - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float *dr0 = din_channel + j * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - unsigned int *vmask_ptr = vmask; - float bias_val = flag_bias ? 
bias[i] : 0.f; - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 02a49cf157296763ce3a61ea99dd4ce513dc2f30..96d0893bc0f0a1c145f4e58dd2caecfba78786ab 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -107,29 +107,35 @@ void im2col(const Dtype* data_im, int width, int kernel_h, int kernel_w, - int pad_h, - int pad_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, int stride_h, int stride_w, int dilation_h, int dilation_w, Dtype* data_col) { const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_top + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { for (int output_cols = output_w; output_cols; output_cols--) { *(data_col++) = 0; } } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_left + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { *(data_col++) = data_im[input_row * width + input_col]; @@ -174,13 +180,14 @@ void conv1x1s1_gemm(const float* i_data, bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; + auto act_param = param.activation_param; + int hblock = get_hblock(ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int weights_size_per_group = m * k; if (n > 1) { weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; } - //! 
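/* Reference sketch (simplified: one channel, dilation 1): the effect of the
 * four-sided padding change to im2col above. The output extent is now derived
 * from pad_top + pad_bottom and pad_left + pad_right instead of 2 * pad, and
 * taps that fall outside the image stay zero. Illustrative helper only. */
#include <vector>

static std::vector<float> im2col_1ch(const float* im, int h, int w, int kh,
                                     int kw, int pad_top, int pad_bottom,
                                     int pad_left, int pad_right, int stride_h,
                                     int stride_w) {
  const int out_h = (h + pad_top + pad_bottom - kh) / stride_h + 1;
  const int out_w = (w + pad_left + pad_right - kw) / stride_w + 1;
  std::vector<float> col(static_cast<size_t>(kh) * kw * out_h * out_w, 0.f);
  float* dst = col.data();
  for (int kr = 0; kr < kh; ++kr) {
    for (int kc = 0; kc < kw; ++kc) {
      for (int orow = 0; orow < out_h; ++orow) {
        const int ir = orow * stride_h - pad_top + kr;
        for (int ocol = 0; ocol < out_w; ++ocol) {
          const int ic = ocol * stride_w - pad_left + kc;
          if (ir >= 0 && ir < h && ic >= 0 && ic < w)
            *dst = im[ir * w + ic];  // in-range tap; others stay zero
          ++dst;
        }
      }
    }
  }
  return col;
}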
use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -202,8 +209,11 @@ void conv1x1s1_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu, - ctx); + act_param.has_active, + act_param.active_type, + ctx, + act_param.Relu_clipped_coef, + act_param.Leaky_relu_alpha); } else { sgemm_prepack(false, m, @@ -217,7 +227,7 @@ void conv1x1s1_gemm(const float* i_data, n, bias_group, flag_bias, - flag_relu, + act_param, ctx); } } @@ -355,6 +365,8 @@ void conv_im2col_gemm(const float* i_data, int hblock = get_hblock(ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int weights_size_per_group = m * k; + + auto act_param = param.activation_param; if (n > 1) { weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; } @@ -362,6 +374,8 @@ void conv_im2col_gemm(const float* i_data, float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto dilations = *param.dilations; //! use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -379,12 +393,14 @@ void conv_im2col_gemm(const float* i_data, win, kernel_h, kernel_w, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dB); if (n == 1) { @@ -396,8 +412,11 @@ void conv_im2col_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu, - ctx); + act_param.has_active, + act_param.active_type, + ctx, + act_param.Relu_clipped_coef, + act_param.Leaky_relu_alpha); } else { int ldb = n; sgemm_prepack(false, @@ -412,7 +431,7 @@ void conv_im2col_gemm(const float* i_data, n, bias_group, flag_bias, - flag_relu, + act_param, ctx); } } @@ -436,14 +455,16 @@ void conv_im2col_gemm_int8(const int8_t* i_data, const float* scale) { int group = param.groups; auto filter_dims = param.filter->dims(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int m = oc / group; const int n = oh * ow; const int k = ic * kernel_h * kernel_w / group; @@ -484,7 +505,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data, kernel_h, kernel_w, pad_h, + paddings[1], pad_w, + paddings[3], stride_h, stride_w, dila_h, @@ -564,90 +587,83 @@ void conv_depthwise_3x3_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - if (pad_w != pad_h) { - LOG(FATAL) << "fp32 depthwise conv3x3 pad_w: " << pad_w - << ", pad_h: " << pad_h << " must be equal"; - return; - } + auto paddings = *param.paddings; + auto act_param = param.activation_param; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; int stride = param.strides[1]; int pad = pad_w; - bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; - if (stride == 1 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - 
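/* Reference sketch (illustrative): what the act_param plumbing above lets the
 * gemv/sgemm epilogue express, compared with the old single flag_relu bool.
 * The enum and helper are hypothetical stand-ins for the ConvParam fields
 * (has_active, active_type, Relu_clipped_coef, Leaky_relu_alpha). */
enum class ActKind { kRelu, kRelu6, kLeakyRelu };

static inline float apply_activation(float v, bool has_active, ActKind kind,
                                     float relu6_coef, float leaky_alpha) {
  if (!has_active) return v;
  switch (kind) {
    case ActKind::kRelu:
      return v > 0.f ? v : 0.f;
    case ActKind::kRelu6:  // clipped relu; coef is typically 6.f
      return v < 0.f ? 0.f : (v > relu6_coef ? relu6_coef : v);
    case ActKind::kLeakyRelu:
      return v > 0.f ? v : leaky_alpha * v;
  }
  return v;
}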
} else if (stride == 2 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s2_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride - << " or pad(<2): " << pad << " unsupported"; - } -#if 0 - if (pad == 1) { - conv_depthwise_3x3p1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && h_in > 2) { - conv_depthwise_3x3p0_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); + bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2)); + if (stride == 1) { + if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s1_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + act_param, + ctx); + } else { + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); + } + } else if (stride == 2) { + if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s2_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + act_param, + ctx); + } else { + conv_3x3s2_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); + } } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; + LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride << " unsupported"; } -#endif } void conv_depthwise_5x5_fp32(const void* din, @@ -664,12 +680,15 @@ void conv_depthwise_5x5_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad = param.paddings[1]; + auto paddings = *param.paddings; + auto act_param = param.activation_param; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; ctx->ExtendWorkspace((w_in + w_out) * sizeof(float)); - if (pad == 2 && stride == 2) { + if (stride == 2) { conv_depthwise_5x5s2_fp32(reinterpret_cast(din), reinterpret_cast(dout), num, @@ -681,25 +700,25 @@ void conv_depthwise_5x5_fp32(const void* din, w_in, reinterpret_cast(weights), bias, - pad, - flag_bias, - flag_relu, + param, + act_param, ctx); } else if (stride == 1) { - conv_depthwise_5x5s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, + conv_depthwise_5x5s1_fp32(reinterpret_cast(dout), + reinterpret_cast(din), reinterpret_cast(weights), bias, - pad, flag_bias, flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + pad_w, + pad_h, + param, ctx); } else { LOG(FATAL) << "unsupport this type 5x5 dw conv"; @@ -720,8 +739,9 @@ void conv_depthwise_3x3_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, 
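/* Condensed view (for reading only) of the new fp32 depthwise-3x3 dispatch
 * above: the specialized s1/s2 kernels are taken only for small, square
 * padding, and every other padding shape now falls back to the generic
 * depthwise implementations instead of aborting.
 *
 *   pads_less = paddings[1] < 2 && paddings[3] < 2;
 *   stride 1: (pads_less && pad_h == pad_w && pad < 2)
 *               ? conv_depthwise_3x3s1_fp32(...)   // fast path
 *               : conv_3x3s1_depthwise_fp32(...);  // generic path
 *   stride 2: same split between conv_depthwise_3x3s2_fp32 and
 *             conv_3x3s2_depthwise_fp32;
 *   other strides: LOG(FATAL).
 */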
const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -778,8 +798,9 @@ void conv_depthwise_3x3_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -836,8 +857,9 @@ void conv_depthwise_5x5_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -858,6 +880,23 @@ void conv_depthwise_5x5_int8_fp32(const void* din, pad_w, pad_h, ctx); + } else if (stride == 2) { + conv_depthwise_5x5s2_int8(reinterpret_cast(dout), + reinterpret_cast(din), + reinterpret_cast(weights), + scale, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + pad_w, + pad_h, + ctx); } else { LOG(FATAL) << "unsupport this type 5x5 dw conv int8"; } @@ -877,8 +916,9 @@ void conv_depthwise_5x5_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -899,6 +939,23 @@ void conv_depthwise_5x5_int8_int8(const void* din, pad_w, pad_h, ctx); + } else if (stride == 2) { + conv_depthwise_5x5s2_int8(reinterpret_cast(dout), + reinterpret_cast(din), + reinterpret_cast(weights), + scale, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + pad_w, + pad_h, + ctx); } else { LOG(FATAL) << "unsupport this type 5x5 dw conv int8"; } diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index c5baa31e1414c4a7a0c926728e5c150c0fc3e21c..60f74b7feecc91a2fe8262a1fea4dce26430031d 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -314,7 +314,51 @@ void fill_bias_int8(int* tensor, const int* bias, int channel, int channel_size); +// new winograd +void weight_trans_c4_8x8( + float* dest, const float* src, int ic, int oc, void* workspace); +void weight_trans_c4_4x4( + float* dest, const float* src, int ic, int oc, void* workspace); +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); +void conv_compute_2x2_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); +void conv_compute_2x2_3x3_small(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* 
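/* Reference sketch (illustrative) for the new Winograd entry points declared
 * above: conv_compute_6x6_3x3 is F(6x6, 3x3), consuming 8x8 input tiles
 * (6 outputs plus a 2-pixel halo) per 6x6 output block, and the 2x2 variants
 * are F(2x2, 3x3) on 4x4 input tiles. The helper below only counts tiles. */
static inline int winograd_tile_count(int hout, int wout, int m /* 6 or 2 */) {
  const int tile_h = (hout + m - 1) / m;  // ceil(hout / m)
  const int tile_w = (wout + m - 1) / m;  // ceil(wout / m)
  return tile_h * tile_w;
}
// Consistent with the "tile_w = (wout + 5) / 6" arithmetic in
// conv_winograd3x3 below.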
diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h
index c5baa31e1414c4a7a0c926728e5c150c0fc3e21c..60f74b7feecc91a2fe8262a1fea4dce26430031d 100644
--- a/lite/backends/arm/math/conv_impl.h
+++ b/lite/backends/arm/math/conv_impl.h
@@ -314,7 +314,51 @@ void fill_bias_int8(int* tensor,
                     const int* bias,
                     int channel,
                     int channel_size);
+// new winograd
+void weight_trans_c4_8x8(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void weight_trans_c4_4x4(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void conv_compute_6x6_3x3(const float* input,
+                          float* output,
+                          int num, int chout, int hout, int wout,
+                          int chin, int hin, int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx);
+void conv_compute_2x2_3x3(const float* input,
+                          float* output,
+                          int num, int chout, int hout, int wout,
+                          int chin, int hin, int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx);
+void conv_compute_2x2_3x3_small(const float* input,
+                                float* output,
+                                int num, int chout, int hout, int wout,
+                                int chin, int hin, int win,
+                                const float* weight,
+                                const float* bias,
+                                const operators::ConvParam& param,
+                                ARMContext* ctx);
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc
index 87b08f63102104b325e95c093fe0fc0aaef243e0..449c9e51db1e67b2a9f0d2d0f6ed0c2c2b2b2772 100644
--- a/lite/backends/arm/math/conv_winograd_3x3.cc
+++ b/lite/backends/arm/math/conv_winograd_3x3.cc
@@ -37,13 +37,15 @@ void conv_winograd3x3(const float* din,
                       const operators::ConvParam& param,
                       ARMContext* ctx) {
   int threads = ctx->threads();
-
-  const int pad_h = param.paddings[0];
-  const int pad_w = param.paddings[1];
+  auto paddings = *param.paddings;
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[1];
   int size_in_channel = win * hin;
   int size_out_channel = wout * hout;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  act_param.has_active = false;
   //! transform input
   int tile_w = (wout + 5) / 6;
@@ -127,7 +129,7 @@ void conv_winograd3x3(const float* din,
                 size_tile,
                 nullptr,
                 false,
-                false,
+                act_param,
                 ctx);
 }
diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc
index a4c61f9a9d181924c28cdd009f8412278d44f5bb..186ad19735799dcb91641354af4b4f09692bfce9 100644
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -557,6 +557,52 @@ void elementwise_mul(const float* dinx,
   }
 }
 
+template <>
+void elementwise_mul<int>(const int* dinx,
+                          const int* diny,
+                          int* dout,
+                          int num) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+#pragma omp parallel for
+  for (int i = 0; i < cnt; ++i) {
+    const int* dinx_ptr = dinx + (i << 4);
+    const int* diny_ptr = diny + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t dinx0 = vld1q_s32(dinx_ptr);
+    int32x4_t dinx1 = vld1q_s32(dinx_ptr + 4);
+    int32x4_t dinx2 = vld1q_s32(dinx_ptr + 8);
+    int32x4_t dinx3 = vld1q_s32(dinx_ptr + 12);
+
+    int32x4_t diny0 = vld1q_s32(diny_ptr);
+    int32x4_t diny1 = vld1q_s32(diny_ptr + 4);
+    int32x4_t diny2 = vld1q_s32(diny_ptr + 8);
+    int32x4_t diny3 = vld1q_s32(diny_ptr + 12);
+
+    dinx0 = vmulq_s32(dinx0, diny0);
+    dinx1 = vmulq_s32(dinx1, diny1);
+    dinx2 = vmulq_s32(dinx2, diny2);
+    dinx3 = vmulq_s32(dinx3, diny3);
+
+    vst1q_s32(dout_ptr, dinx0);
+    vst1q_s32(dout_ptr + 4, dinx1);
+    vst1q_s32(dout_ptr + 8, dinx2);
+    vst1q_s32(dout_ptr + 12, dinx3);
+  }
+  if (remain > 0) {
+    const int* dinx_ptr = dinx + (cnt << 4);
+    const int* diny_ptr = diny + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *dinx_ptr * *diny_ptr;
+      dout_ptr++;
+      dinx_ptr++;
+      diny_ptr++;
+    }
+  }
+}
+
 template <>
 void elementwise_mul_relu(const float* dinx,
                           const float* diny,
@@ -678,6 +724,73 @@ void elementwise_mul_broadcast(const float* dinx,
   }
 }
 
+template <>
+void elementwise_mul_broadcast<int>(const int* dinx,
+                                    const int* diny,
+                                    int* dout,
+                                    int batch,
+                                    int channels,
+                                    int num) {
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      int offset = (i * channels + j) * num;
+      const int* din_ptr = dinx + offset;
+      const int diny_data = diny[j];
+      int* dout_ptr = dout + offset;
+
+      int cnt = num >> 4;
+      int remain = num % 16;
+      int32x4_t rb = vdupq_n_s32(diny_data);
+      for (int k = 0; k < cnt; ++k) {
+        int32x4_t din0 = vld1q_s32(din_ptr);
+        int32x4_t din1 = vld1q_s32(din_ptr + 4);
+        int32x4_t din2 = vld1q_s32(din_ptr + 8);
+        int32x4_t din3 =
vld1q_s32(din_ptr + 12); + + din0 = vmulq_s32(din0, rb); + din1 = vmulq_s32(din1, rb); + din2 = vmulq_s32(din2, rb); + din3 = vmulq_s32(din3, rb); + + vst1q_s32(dout_ptr, din0); + vst1q_s32(dout_ptr + 4, din1); + vst1q_s32(dout_ptr + 8, din2); + vst1q_s32(dout_ptr + 12, din3); + + din_ptr += 16; + dout_ptr += 16; + } + if (remain >= 8) { + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + din0 = vmulq_s32(din0, rb); + din1 = vmulq_s32(din1, rb); + vst1q_s32(dout_ptr, din0); + vst1q_s32(dout_ptr + 4, din1); + din_ptr += 8; + dout_ptr += 8; + remain -= 8; + } + if (remain >= 4) { + int32x4_t din0 = vld1q_s32(din_ptr); + din0 = vmulq_s32(din0, rb); + vst1q_s32(dout_ptr, din0); + din_ptr += 4; + dout_ptr += 4; + remain -= 4; + } + if (remain > 0) { + for (int p = 0; p < remain; ++p) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } +} + template <> void elementwise_mul_relu_broadcast(const float* dinx, const float* diny, diff --git a/lite/backends/arm/math/fill_bias_relu.cc b/lite/backends/arm/math/fill_bias_relu.cc index 7137a0363ba42b9c6416c6f98b0d4a6b5a1687fb..d816c2f549c2c074a35885931a585ff51ae97f6f 100644 --- a/lite/backends/arm/math/fill_bias_relu.cc +++ b/lite/backends/arm/math/fill_bias_relu.cc @@ -115,7 +115,241 @@ void fill_bias_relu(int* tensor, } } } - +#ifdef __aarch64__ +#define FILL_BIAS \ + "1: \n" \ + "ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "add v0.4s, v0.4s, %[vbias].4s \n" \ + "add v1.4s, v1.4s, %[vbias].4s \n" \ + "add v2.4s, v2.4s, %[vbias].4s \n" \ + "add v3.4s, v3.4s, %[vbias].4s \n" +#define FILL_RELU \ + "fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \ + "fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */ +#define FILL_RELU6 \ + "fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \ + "fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */ +#define FILL_LEAKY_RELU \ + "fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "fcmge v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "fcmge v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ + "fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \ + "bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \ + "bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \ + "bif v3.16b, v11.16b, v10.16b \n" /* choose*/ +#define FILL_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \ + "bne 1b \n" +#else +#define FILL_BIAS \ + "1: \n" \ + "vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \ + "vld1.32 {d12-d13}, [%[din_ptr]]! 
@ vld1q_f32(din_ptr) \n" \ + "vadd.f32 q3, q3, %q[vbias] @ add \n" \ + "vadd.f32 q4, q4, %q[vbias] @ add \n" \ + "vadd.f32 q5, q5, %q[vbias] @ add \n" \ + "vadd.f32 q6, q6, %q[vbias] @ add \n" +#define FILL_RELU \ + "vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \ + "vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n" +#define FILL_RELU6 \ + "vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \ + "vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \ + "vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \ + "vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n" +#define FILL_LEAKY_RELU \ + "vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \ + "vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \ + "vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \ + "vbif q3, q8, q7 @ choose \n" \ + "vbif q4, q10, q9 @ choose \n" \ + "vbif q5, q12, q11 @ choose \n" \ + "vbif q6, q14, q13 @ choose \n" +#define FILL_STORE \ + "subs %[cnt], #1 \n" \ + "vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \ + "bne 1b \n" +#endif +template <> +void fill_bias_act(float* tensor, + const float* bias, + int channel, + int channel_size, + bool flag_bias, + const operators::ActivationParam* act_param) { + float* data = tensor; + int cnt = channel_size >> 4; + int remain = channel_size % 16; + float32x4_t vzero = vdupq_n_f32(0.f); + if (act_param != nullptr && act_param->has_active) { + float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha); + for (int j = 0; j < channel; j++) { + float bias_data = flag_bias ? 
bias[j] : 0.f;
+      float* src = data + j * channel_size;
+      float* dst = data + j * channel_size;
+      float32x4_t vbias = vdupq_n_f32(bias_data);
+      int cnt_num = cnt;  // local copy: the inline asm below decrements it
+      if (cnt_num > 0) {
+        switch (act_param->active_type) {
+          case lite_api::ActivationType::kRelu:
+#ifdef __aarch64__
+            asm volatile(
+                FILL_BIAS FILL_RELU FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vbias] "w"(vbias)
+                : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+            asm volatile(
+                FILL_BIAS FILL_RELU FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vbias] "w"(vbias)
+                : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+            break;
+          case lite_api::ActivationType::kRelu6:
+#ifdef __aarch64__
+            asm volatile(
+                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
+                : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+            asm volatile(
+                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
+                : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+            break;
+          case lite_api::ActivationType::kLeakyRelu:
+#ifdef __aarch64__
+            asm volatile(
+                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
+                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+                  "v6", "v7", "v8", "v9", "v10", "v11");
+#else
+            asm volatile(
+                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
+                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt_num)
+                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
+                : "memory", "cc", "q3", "q4", "q5", "q6", "q7", "q8",
+                  "q9", "q10", "q11", "q12", "q13", "q14");
+#endif
+            break;
+          default:
+            LOG(FATAL) << "this act_type: "
+                       << static_cast<int>(act_param->active_type)
+                       << " fuse not support";
+        }
+      }
+      // scalar tail: add bias, then apply the same activation
+      switch (act_param->active_type) {
+        case lite_api::ActivationType::kRelu:
+          for (int i = 0; i < remain; i++) {
+            float tmp = *src + bias_data;
+            *dst = tmp >= 0.f ? tmp : 0.f;
+            src++;
+            dst++;
+          }
+          break;
+        case lite_api::ActivationType::kRelu6:
+          for (int i = 0; i < remain; i++) {
+            float tmp = *src + bias_data;
+            tmp = tmp >= 0.f ? tmp : 0.f;
+            *dst = tmp <= act_param->Relu_clipped_coef
+                       ? tmp
+                       : act_param->Relu_clipped_coef;
+            src++;
+            dst++;
+          }
+          break;
+        case lite_api::ActivationType::kLeakyRelu:
+          for (int i = 0; i < remain; i++) {
+            float tmp = *src + bias_data;
+            *dst = tmp >= 0.f ? tmp : tmp * act_param->Leaky_relu_alpha;
+            src++;
+            dst++;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
+      }
+    }
+  } else {
+    for (int j = 0; j < channel; ++j) {
+      float bias_data = flag_bias ?
bias[j] : 0.f;
+      float32x4_t vbias = vdupq_n_f32(bias_data);
+      float* src = data + j * channel_size;
+      float* dst = data + j * channel_size;
+      int cnt_num = cnt;  // local copy: the inline asm decrements it
+      if (cnt_num > 0) {
+#ifdef __aarch64__
+        asm volatile(FILL_BIAS FILL_STORE
+                     : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst),
+                       [cnt] "+r"(cnt_num)
+                     : [vbias] "w"(vbias)
+                     : "memory", "cc", "v0", "v1", "v2", "v3");
+#else
+        asm volatile(FILL_BIAS FILL_STORE
+                     : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst),
+                       [cnt] "+r"(cnt_num)
+                     : [vbias] "w"(vbias)
+                     : "memory", "cc", "q3", "q4", "q5", "q6");
+#endif
+      }
+      for (int i = 0; i < remain; i++) {  // scalar tail: bias only
+        *dst++ = *src++ + bias_data;
+      }
+    }
+  }
+}
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/backends/arm/math/fill_bias_relu.h b/lite/backends/arm/math/fill_bias_relu.h
index 254d6d43be8aca8b17cb2fb2107977095facba51..ce775a96a13dad7fddac34e211fc19267a9d48fc 100644
--- a/lite/backends/arm/math/fill_bias_relu.h
+++ b/lite/backends/arm/math/fill_bias_relu.h
@@ -37,7 +37,22 @@ void fill_bias_relu(Dtype* tensor,
                     int channel_size,
                     bool flag_bias,
                     bool flag_relu);
-
+/**
+ * \brief neon implementation to add bias and activation (relu, relu6,
+ *        leakyrelu)
+ * @param tensor
+ * @param bias
+ * @param channel
+ * @param channel_size
+ */
+template <typename Dtype>
+void fill_bias_act(Dtype* tensor,
+                   const Dtype* bias,
+                   int channel,
+                   int channel_size,
+                   bool flag_bias,
+                   const operators::ActivationParam* act_param);
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/backends/arm/math/funcs.cc b/lite/backends/arm/math/funcs.cc
index e4425ade2efebdaad9136f75c39493f2bd3df4ca..8d20e5242e556c86a1283a64ff9ccf51e2efa247 100644
--- a/lite/backends/arm/math/funcs.cc
+++ b/lite/backends/arm/math/funcs.cc
@@ -21,128 +21,179 @@ namespace arm {
 namespace math {
 
 template <>
-void fill_bias_fc<float>(float *out, const float *bias, int num, int channel) {
+void fill_bias_fc<float>(
+    float *out, const float *bias, int num, int channel, bool flag_relu) {
   int cnt = channel >> 4;
   int remain = channel & 15;
-
-  for (int j = 0; j < num; ++j) {
-    const float *ptr_bias = bias;
-    float *ptr_out = out + j * channel;
-
-    float32x4_t vout1;
-    float32x4_t vout2;
-    float32x4_t vout3;
-    float32x4_t vout4;
-
-    for (int i = 0; i < cnt; ++i) {
-      float32x4_t vin1 = vld1q_f32(ptr_out);
-      float32x4_t vb1 = vld1q_f32(ptr_bias);
-      float32x4_t vin2 = vld1q_f32(ptr_out + 4);
-      float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
-      float32x4_t vin3 = vld1q_f32(ptr_out + 8);
-      float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
-      float32x4_t vin4 = vld1q_f32(ptr_out + 12);
-      float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
-
-      vout1 = vaddq_f32(vin1, vb1);
-      vout2 = vaddq_f32(vin2, vb2);
-      vout3 = vaddq_f32(vin3, vb3);
-      vout4 = vaddq_f32(vin4, vb4);
-
-      vst1q_f32(ptr_out, vout1);
-      vst1q_f32(ptr_out + 4, vout2);
-      vst1q_f32(ptr_out + 8, vout3);
-      vst1q_f32(ptr_out + 12, vout4);
-
-      ptr_out += 16;
-      ptr_bias += 16;
+  if (flag_relu) {
+    float32x4_t vzero = vdupq_n_f32(0.f);
+    for (int j = 0; j < num; ++j) {
+      const float *ptr_bias = bias;
+      float *ptr_out = out + j * channel;
+
+      for (int i = 0; i < cnt; ++i) {
+        float32x4_t vin1 = vld1q_f32(ptr_out);
+        float32x4_t vb1 = vld1q_f32(ptr_bias);
+        float32x4_t vin2 = vld1q_f32(ptr_out + 4);
+        float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
+        float32x4_t vin3 = vld1q_f32(ptr_out + 8);
+        float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
+        float32x4_t vin4 = vld1q_f32(ptr_out + 12);
+        float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
+
+        float32x4_t vout1 = vaddq_f32(vin1, vb1);
+        float32x4_t vout2 = vaddq_f32(vin2, vb2);
+        float32x4_t vout3 = vaddq_f32(vin3, vb3);
+        float32x4_t vout4 = vaddq_f32(vin4, vb4);
+
+        vout1 =
vmaxq_f32(vout1, vzero); + vout2 = vmaxq_f32(vout2, vzero); + vout3 = vmaxq_f32(vout3, vzero); + vout4 = vmaxq_f32(vout4, vzero); + + vst1q_f32(ptr_out, vout1); + vst1q_f32(ptr_out + 4, vout2); + vst1q_f32(ptr_out + 8, vout3); + vst1q_f32(ptr_out + 12, vout4); + + ptr_out += 16; + ptr_bias += 16; + } + for (int i = 0; i < remain; ++i) { + *ptr_out += *(ptr_bias++); + *ptr_out = *ptr_out > 0.f ? *ptr_out : 0.f; + ptr_out++; + } } -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.f32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); + } else { + for (int j = 0; j < num; ++j) { + const float *ptr_bias = bias; + float *ptr_out = out + j * channel; + + for (int i = 0; i < cnt; ++i) { + float32x4_t vin1 = vld1q_f32(ptr_out); + float32x4_t vb1 = vld1q_f32(ptr_bias); + + float32x4_t vin2 = vld1q_f32(ptr_out + 4); + float32x4_t vb2 = vld1q_f32(ptr_bias + 4); + + float32x4_t vin3 = vld1q_f32(ptr_out + 8); + float32x4_t vb3 = vld1q_f32(ptr_bias + 8); + + float32x4_t vin4 = vld1q_f32(ptr_out + 12); + float32x4_t vb4 = vld1q_f32(ptr_bias + 12); + + float32x4_t vout1 = vaddq_f32(vin1, vb1); + float32x4_t vout2 = vaddq_f32(vin2, vb2); + float32x4_t vout3 = vaddq_f32(vin3, vb3); + float32x4_t vout4 = vaddq_f32(vin4, vb4); + + vst1q_f32(ptr_out, vout1); + vst1q_f32(ptr_out + 4, vout2); + vst1q_f32(ptr_out + 8, vout3); + vst1q_f32(ptr_out + 12, vout4); + + ptr_out += 16; + ptr_bias += 16; + } + for (int i = 0; i < remain; ++i) { + *(ptr_out++) += *(ptr_bias++); + } } } } template <> -void fill_bias_fc(int *out, const int *bias, int num, int channel) { +void fill_bias_fc( + int *out, const int *bias, int num, int channel, bool flag_relu) { int cnt = channel >> 4; int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const int *ptr_bias = bias; - int *ptr_out = out + j * channel; - - int32x4_t vout1; - int32x4_t vout2; - int32x4_t vout3; - int32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - int32x4_t vin1 = vld1q_s32(ptr_out); - int32x4_t vb1 = vld1q_s32(ptr_bias); - - int32x4_t vin2 = vld1q_s32(ptr_out + 4); - int32x4_t vb2 = vld1q_s32(ptr_bias + 4); - - int32x4_t vin3 = vld1q_s32(ptr_out + 8); - int32x4_t vb3 = vld1q_s32(ptr_bias + 8); - - int32x4_t vin4 = vld1q_s32(ptr_out + 12); - int32x4_t vb4 = vld1q_s32(ptr_bias + 12); - - vout1 = vaddq_s32(vin1, vb1); - vout2 = vaddq_s32(vin2, vb2); - vout3 = vaddq_s32(vin3, vb3); - vout4 = vaddq_s32(vin4, vb4); - - vst1q_s32(ptr_out, vout1); - vst1q_s32(ptr_out + 4, vout2); - vst1q_s32(ptr_out + 8, vout3); - vst1q_s32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } - -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.s32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! 
@ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); + if (flag_relu) { + for (int j = 0; j < num; ++j) { + const int *ptr_bias = bias; + int *ptr_out = out + j * channel; + + int32x4_t vzero = vdupq_n_s32(0); + + for (int i = 0; i < cnt; ++i) { + int32x4_t vin1 = vld1q_s32(ptr_out); + int32x4_t vb1 = vld1q_s32(ptr_bias); + + int32x4_t vin2 = vld1q_s32(ptr_out + 4); + int32x4_t vb2 = vld1q_s32(ptr_bias + 4); + + int32x4_t vin3 = vld1q_s32(ptr_out + 8); + int32x4_t vb3 = vld1q_s32(ptr_bias + 8); + + int32x4_t vin4 = vld1q_s32(ptr_out + 12); + int32x4_t vb4 = vld1q_s32(ptr_bias + 12); + + int32x4_t vout1 = vaddq_s32(vin1, vb1); + int32x4_t vout2 = vaddq_s32(vin2, vb2); + int32x4_t vout3 = vaddq_s32(vin3, vb3); + int32x4_t vout4 = vaddq_s32(vin4, vb4); + + vout1 = vmaxq_s32(vout1, vzero); + vout2 = vmaxq_s32(vout2, vzero); + vout3 = vmaxq_s32(vout3, vzero); + vout4 = vmaxq_s32(vout4, vzero); + + vst1q_s32(ptr_out, vout1); + vst1q_s32(ptr_out + 4, vout2); + vst1q_s32(ptr_out + 8, vout3); + vst1q_s32(ptr_out + 12, vout4); + + ptr_out += 16; + ptr_bias += 16; + } + for (int i = 0; i < remain; ++i) { + *ptr_out += *(ptr_bias++); + *ptr_out = *ptr_out > 0 ? *ptr_out : 0; + ptr_out++; + } } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); + } else { + for (int j = 0; j < num; ++j) { + const int *ptr_bias = bias; + int *ptr_out = out + j * channel; + + int32x4_t vout1; + int32x4_t vout2; + int32x4_t vout3; + int32x4_t vout4; + + for (int i = 0; i < cnt; ++i) { + int32x4_t vin1 = vld1q_s32(ptr_out); + int32x4_t vb1 = vld1q_s32(ptr_bias); + + int32x4_t vin2 = vld1q_s32(ptr_out + 4); + int32x4_t vb2 = vld1q_s32(ptr_bias + 4); + + int32x4_t vin3 = vld1q_s32(ptr_out + 8); + int32x4_t vb3 = vld1q_s32(ptr_bias + 8); + + int32x4_t vin4 = vld1q_s32(ptr_out + 12); + int32x4_t vb4 = vld1q_s32(ptr_bias + 12); + + vout1 = vaddq_s32(vin1, vb1); + vout2 = vaddq_s32(vin2, vb2); + vout3 = vaddq_s32(vin3, vb3); + vout4 = vaddq_s32(vin4, vb4); + + vst1q_s32(ptr_out, vout1); + vst1q_s32(ptr_out + 4, vout2); + vst1q_s32(ptr_out + 8, vout3); + vst1q_s32(ptr_out + 12, vout4); + + ptr_out += 16; + ptr_bias += 16; + } + for (int i = 0; i < remain; ++i) { + *(ptr_out++) += *(ptr_bias++); + } } } } diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index d8ef6ff47d0392ac15caf2d94b7c53ff63659da2..e975160c97b6e7396ab208805a4d685586ac00c8 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -39,16 +39,19 @@ #include "lite/backends/arm/math/im2sequence.h" #include "lite/backends/arm/math/increment.h" #include "lite/backends/arm/math/interpolate.h" +#include "lite/backends/arm/math/layout.h" #include "lite/backends/arm/math/lrn.h" #include "lite/backends/arm/math/negative.h" #include "lite/backends/arm/math/norm.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" #include "lite/backends/arm/math/pad2d.h" #include "lite/backends/arm/math/pooling.h" #include "lite/backends/arm/math/power.h" #include "lite/backends/arm/math/prior_box.h" #include "lite/backends/arm/math/reduce_max.h" #include "lite/backends/arm/math/reduce_mean.h" +#include "lite/backends/arm/math/reduce_prod.h" #include "lite/backends/arm/math/scale.h" #include "lite/backends/arm/math/sequence_expand.h" #include "lite/backends/arm/math/sequence_pool.h" @@ -59,6 +62,7 @@ #include 
"lite/backends/arm/math/slice.h" #include "lite/backends/arm/math/softmax.h" #include "lite/backends/arm/math/split.h" +#include "lite/backends/arm/math/split_merge_lod_tenosr.h" #include "lite/backends/arm/math/stack.h" #include "lite/backends/arm/math/topk.h" #include "lite/backends/arm/math/yolo_box.h" @@ -352,7 +356,8 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) { } template -void fill_bias_fc(T* tensor, const T* bias, int num, int channel); +void fill_bias_fc( + T* tensor, const T* bias, int num, int channel, bool flag_relu); template inline float32x4_t vactive_f32(const float32x4_t& x) { diff --git a/lite/backends/arm/math/gru_utils.h b/lite/backends/arm/math/gru_utils.h index 9bef1889b83d1e212c928562f777ba4706c3436a..9d57f81fc584b56ef5552b4fb2e079f3b62390e0 100644 --- a/lite/backends/arm/math/gru_utils.h +++ b/lite/backends/arm/math/gru_utils.h @@ -383,6 +383,8 @@ struct GRUUnitFunctor { const lite_api::ActivationType active_gate, bool origin_mode, ARMContext* ctx) { + operators::ActivationParam act_param; + act_param.has_active = false; if (value.prev_out_value) { sgemm(false, false, @@ -399,7 +401,7 @@ struct GRUUnitFunctor { frame_size * 3, nullptr, false, - false, + act_param, ctx); } gru_unit_reset_act(active_gate, value, frame_size, batch_size); @@ -420,7 +422,7 @@ struct GRUUnitFunctor { frame_size * 3, nullptr, false, - false, + act_param, ctx); } diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc index e9e18043dfc09001ebba23f952a59474630e54aa..1c53142fc53bc785efcbf28fa007d403ad99ab70 100644 --- a/lite/backends/arm/math/interpolate.cc +++ b/lite/backends/arm/math/interpolate.cc @@ -477,17 +477,23 @@ void nearest_interp(const float* src, float scale_h_new = (with_align) ? (static_cast(h_in - 1) / (h_out - 1)) : (static_cast(h_in) / (h_out)); - -#pragma omp parallel for collapse(2) schedule(static) - for (int h = 0; h < h_out; ++h) { - for (int w = 0; w < w_out; ++w) { - int near_x = (with_align) ? static_cast(scale_w_new * w + 0.5) - : static_cast(scale_w_new * w); - int near_y = (with_align) ? static_cast(scale_h_new * h + 0.5) - : static_cast(scale_h_new * h); - near_x = near_x < 0 ? 0 : near_x; - near_y = near_y < 0 ? 
diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc
index e9e18043dfc09001ebba23f952a59474630e54aa..1c53142fc53bc785efcbf28fa007d403ad99ab70 100644
--- a/lite/backends/arm/math/interpolate.cc
+++ b/lite/backends/arm/math/interpolate.cc
@@ -477,17 +477,23 @@ void nearest_interp(const float* src,
   float scale_h_new = (with_align)
                           ? (static_cast<float>(h_in - 1) / (h_out - 1))
                           : (static_cast<float>(h_in) / (h_out));
-
-#pragma omp parallel for collapse(2) schedule(static)
-  for (int h = 0; h < h_out; ++h) {
-    for (int w = 0; w < w_out; ++w) {
-      int near_x = (with_align) ? static_cast<int>(scale_w_new * w + 0.5)
-                                : static_cast<int>(scale_w_new * w);
-      int near_y = (with_align) ? static_cast<int>(scale_h_new * h + 0.5)
-                                : static_cast<int>(scale_h_new * h);
-      near_x = near_x < 0 ? 0 : near_x;
-      near_y = near_y < 0 ? 0 : near_y;
-      dst[h * w_out + w] = src[near_y * w_in + near_x];
+  if (with_align) {
+    for (int h = 0; h < h_out; ++h) {
+      float* dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h + 0.5);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w + 0.5);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
+    }
+  } else {
+    for (int h = 0; h < h_out; ++h) {
+      float* dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
     }
   }
 }
@@ -520,9 +526,9 @@ void interpolate(lite::Tensor* X,
   }
   auto out_size = OutSize;
   if (out_size != nullptr) {
-    auto out_size_data = get_new_data_from_tensor<float>(out_size);
-    out_height = static_cast<int>(out_size_data[0]);
-    out_width = static_cast<int>(out_size_data[1]);
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_height = out_size_data[0];
+    out_width = out_size_data[1];
   }
 }
 float height_scale = scale;
@@ -544,8 +550,10 @@ void interpolate(lite::Tensor* X,
   int out_w = Out->dims()[3];
   int spatial_in = in_h * in_w;
   int spatial_out = out_h * out_w;
-  for (int i = 0; i < count; ++i) {
-    if ("Bilinear" == interpolate_type) {
+
+  if ("Bilinear" == interpolate_type) {
+#pragma omp parallel for
+    for (int i = 0; i < count; ++i) {
       bilinear_interp(din + spatial_in * i,
                       in_w, in_h,
                       dout + spatial_out * i,
                       out_w, out_h,
                       1.f / width_scale,
                       1.f / height_scale,
                       with_align);
-    } else if ("Nearest" == interpolate_type) {
+    }
+  } else if ("Nearest" == interpolate_type) {
+#pragma omp parallel for
+    for (int i = 0; i < count; ++i) {
       nearest_interp(din + spatial_in * i,
                      in_w, in_h,
diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd9126ab48c8f829c82d0c78a338074c695f0b9c
--- /dev/null
+++ b/lite/backends/arm/math/layout.cc
@@ -0,0 +1,668 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
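For orientation, the NEON transposes defined in this new file implement the following scalar mapping; a reference version the vectorized paths must agree with (sketch only, not part of the patch):

// Reference semantics for the NEON layout kernels below, with size = H*W:
// NCHW -> NHWC means Y[((n*size + s)*C) + c] = X[((n*C + c)*size) + s].
template <typename T>
void nchw2nhwc_ref(int N, int C, int size, const T* X, T* Y) {
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int s = 0; s < size; ++s) {
        Y[n * C * size + s * C + c] = X[n * C * size + c * size + s];
      }
    }
  }
}
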
+ +#include "lite/backends/arm/math/layout.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +#ifdef __aarch64__ +#define TRANS_C4 \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "1: \n" \ + "trn1 v4.4s, v0.4s, v1.4s \n" /*00 10 02 12 */ \ + "trn1 v5.4s, v2.4s, v3.4s \n" /*20 30 22 32 */ \ + "trn2 v6.4s, v0.4s, v1.4s \n" /*01 11 03 13 */ \ + "trn2 v7.4s, v2.4s, v3.4s \n" /*21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + \ + "trn1 v8.2d, v4.2d, v5.2d \n" /*00 10 20 30 */ \ + "trn1 v9.2d, v6.2d, v7.2d \n" /*01 11 21 31 */ \ + "trn2 v10.2d, v4.2d, v5.2d \n" /*02 12 22 32 */ \ + "trn2 v11.2d, v6.2d, v7.2d \n" /*03 13 23 33 */ \ + \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "str q8, [%[out0_ptr]], #16 \n" \ + "str q9, [%[out1_ptr]], #16 \n" \ + "str q10, [%[out2_ptr]], #16 \n" \ + "str q11, [%[out3_ptr]], #16 \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "ld1 {v0.8b}, [%[din0_ptr]] \n" \ + "ld1 {v1.8b}, [%[din1_ptr]] \n" \ + "ld1 {v2.8b}, [%[din2_ptr]] \n" \ + "ld1 {v3.8b}, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "trn1 v8.8b, v0.8b, v1.8b \n" /*00 10 02 12 04 14 06 16 */ \ + "trn1 v9.8b, v2.8b, v3.8b \n" /*20 30 22 32 */ \ + "trn2 v12.8b, v0.8b, v1.8b \n" /*01 11 03 13 05 15 07 17 */ \ + "trn2 v13.8b, v2.8b, v3.8b \n" /*21 31 23 33 */ \ + \ + "ld1 {v4.8b}, [%[din0_ptr]] \n" \ + "ld1 {v5.8b}, [%[din1_ptr]] \n" \ + "ld1 {v6.8b}, [%[din2_ptr]] \n" \ + "ld1 {v7.8b}, [%[din3_ptr]] \n" \ + \ + "trn1 v10.8b, v4.8b, v5.8b \n" /*40 50 42 52 */ \ + "trn1 v11.8b, v6.8b, v7.8b \n" /*60 70 62 72 */ \ + "trn2 v14.8b, v4.8b, v5.8b \n" /*41 51 43 53 */ \ + "trn2 v15.8b, v6.8b, v7.8b \n" /*61 71 63 73 */ \ + \ + "trn1 v0.4h, v8.4h, v9.4h \n" /*00 10 20 30 04 14 24 34*/ \ + "trn1 v2.4h, v12.4h, v13.4h \n" /*01 11 21 31 05 15 25 35*/ \ + "trn1 v1.4h, v10.4h, v11.4h \n" /*40 50 60 70 44 54 64 74*/ \ + "trn1 v3.4h, v14.4h, v15.4h \n" /*41 51 61 71 45 55 65 75*/ \ + \ + "trn2 v4.4h, v8.4h, v9.4h \n" /*02 10 20 30 06 14 24 34*/ \ + "trn2 v6.4h, v12.4h, v13.4h \n" /*03 11 21 31 07 15 25 35*/ \ + "trn2 v5.4h, v10.4h, v11.4h \n" /*42 50 60 70 46 54 64 74*/ \ + "trn2 v7.4h, v14.4h, v15.4h \n" /*43 51 61 71 47 55 65 75*/ \ + \ + "trn1 v8.2s, v0.2s, v1.2s \n" /*00 10 20 30 40 50 60 70*/ \ + "trn1 v9.2s, v2.2s, v3.2s \n" /*01 11 21 31 41 51 61 71*/ \ + "trn1 v10.2s, v4.2s, v5.2s \n" /*02 12 22 32 42 50 60 70*/ \ + "trn1 v11.2s, v6.2s, v7.2s \n" /*03 13 23 33 41 51 61 71*/ \ + \ + "trn2 v12.2s, v0.2s, v1.2s \n" /*04 14 24 34 44 54 64 74*/ \ + "trn2 v13.2s, v2.2s, v3.2s \n" /*05 15 25 35 45 55 65 75*/ \ + "trn2 v14.2s, v4.2s, v5.2s \n" /*06 16 22 32 42 50 60 70*/ \ + "trn2 v15.2s, v6.2s, v7.2s \n" /*07 17 23 33 41 51 61 71*/ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* 
din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v8.8b}, [%[out0_ptr]], #8 \n" \ + "st1 {v9.8b}, [%[out1_ptr]], #8 \n" \ + "st1 {v10.8b}, [%[out2_ptr]], #8 \n" \ + "st1 {v11.8b}, [%[out3_ptr]], #8 \n" \ + \ + "st1 {v11.8b}, [%[out4_ptr]], #8 \n" \ + "st1 {v12.8b}, [%[out5_ptr]], #8 \n" \ + "st1 {v13.8b}, [%[out6_ptr]], #8 \n" \ + "st1 {v14.8b}, [%[out7_ptr]], #8 \n" \ + "bne 1b \n" + +#else +#define TRANS_C4 \ + "1: \n" \ + "vld1.32 {d0-d1}, [%[din0_ptr]] \n" \ + "vld1.32 {d2-d3}, [%[din1_ptr]] \n" \ + "vld1.32 {d4-d5}, [%[din2_ptr]] \n" \ + "vld1.32 {d6-d7}, [%[din3_ptr]] \n" \ + \ + "vtrn.32 q0, q1 \n" /*00 10 02 12 01 11 03 13*/ \ + "vtrn.32 q2, q3 \n" /*20 30 22 32 21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + "vswp d1, d4 \n" \ + "vswp d3, d6 \n" \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.32 {d0-d1}, [%[out0_ptr]]! \n" \ + "vst1.32 {d2-d3}, [%[out1_ptr]]! \n" \ + "vst1.32 {d4-d5}, [%[out2_ptr]]! \n" \ + "vst1.32 {d6-d7}, [%[out3_ptr]]! \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "vld1.8 d0, [%[din0_ptr]] \n" \ + "vld1.8 d1, [%[din1_ptr]] \n" \ + "vld1.8 d2, [%[din2_ptr]] \n" \ + "vld1.8 d3, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.8 d0, d1 \n" /*00 10 02 12 04 14 06 16*/ \ + "vtrn.8 d2, d3 \n" /*20 30 22 32 24 34 26 36 */ \ + \ + "vld1.8 d4, [%[din0_ptr]] \n" \ + "vld1.8 d5, [%[din1_ptr]] \n" \ + "vld1.8 d6, [%[din2_ptr]] \n" \ + "vld1.8 d7, [%[din3_ptr]] \n" \ + \ + "vtrn.16 d0, d2 \n" /*00 10 20 30 04 14 24 34*/ \ + "vtrn.16 d1, d3 \n" /* 01 11 21 31 05 15 25 35 */ \ + "vtrn.8 d4, d5 \n" /*40 50 02 12 04 14 06 16*/ \ + "vtrn.8 d6, d7 \n" /*60 70 22 32 24 34 26 36 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.16 d4, d6 \n" /*40 50 60 70 04 14 24 34*/ \ + "vtrn.16 d5, d7 \n" /* 41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d4 \n" /*00 10 20 30 40 50 60 70*/ \ + "vtrn.32 d1, d5 \n" /* 01 11 21 31 41 51 61 71 */ \ + "vtrn.32 d2, d6 \n" /*02 12 22 32 42 52 62 72*/ \ + "vtrn.32 d3, d7 \n" /* 03 11 21 33 43 53 63 73 */ \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.8 {d0}, [%[out0_ptr]]! \n" \ + "vst1.8 {d1}, [%[out1_ptr]]! \n" \ + "vst1.8 {d2}, [%[out2_ptr]]! \n" \ + "vst1.8 {d3}, [%[out3_ptr]]! \n" \ + "vst1.8 {d4}, [%[out4_ptr]]! \n" \ + "vst1.8 {d5}, [%[out5_ptr]]! \n" \ + "vst1.8 {d6}, [%[out6_ptr]]! \n" \ + "vst1.8 {d7}, [%[out7_ptr]]! 
\n" \ + "bne 1b \n" + +#endif +template <> +void NCHW2NHWC(int N, int C, int size, const float* X, float* Y) { + int cnt = C >> 2; + int remain = C % 4; + int sum = C * size; + int stride = size << 4; // 4 * size + int stride_w = stride >> 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + float* out1_ptr = out0_ptr + C; + float* out2_ptr = out1_ptr + C; + float* out3_ptr = out2_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += size; + } + } + // remain size + for (; s < size; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = C >> 3; + int remain = C % 8; + int sum = C * size; + int stride = size << 3; // 8 * size + int stride_w = size << 4; // 4 * size * 4 + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + int8_t* out0_ptr = dout + s * C; + int8_t* out1_ptr = out0_ptr + C; + int8_t* out2_ptr = out1_ptr + C; + int8_t* out3_ptr = out2_ptr + C; + int8_t* out4_ptr = out3_ptr + C; + int8_t* out5_ptr = out4_ptr + C; + int8_t* out6_ptr = out5_ptr + C; + int8_t* out7_ptr = out6_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] 
"+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr = *ptr++; + *out1_ptr = *ptr++; + *out2_ptr = *ptr++; + *out3_ptr = *ptr++; + din0_ptr += size; + *out4_ptr = *ptr++; + *out5_ptr = *ptr++; + *out6_ptr = *ptr++; + *out7_ptr = *ptr++; + } + } + // remain size + for (; s < size; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + const int8_t* din4_ptr = din3_ptr + size; + const int8_t* din5_ptr = din4_ptr + size; + const int8_t* din6_ptr = din5_ptr + size; + const int8_t* din7_ptr = din6_ptr + size; + int8_t* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { + int cnt = size >> 2; + int remain = size % 4; + int sum = C * size; + int stride = C << 4; // 4 * size + int stride_w = C << 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + float* out1_ptr = out0_ptr + size; + float* out2_ptr = out1_ptr + size; + float* out3_ptr = out2_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] 
"+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = size >> 3; + int remain = size % 8; + int sum = C * size; + int stride = C << 3; // 8 * size + int stride_w = C << 4; // 4 * size + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + int8_t* out1_ptr = out0_ptr + size; + int8_t* out2_ptr = out1_ptr + size; + int8_t* out3_ptr = out2_ptr + size; + int8_t* out4_ptr = out3_ptr + size; + int8_t* out5_ptr = out4_ptr + size; + int8_t* out6_ptr = out5_ptr + size; + int8_t* out7_ptr = out6_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + *out4_ptr++ = *ptr++; + *out5_ptr++ = *ptr++; + *out6_ptr++ = *ptr++; + *out7_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + 
const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/layout.h b/lite/backends/arm/math/layout.h new file mode 100644 index 0000000000000000000000000000000000000000..ed0e2f8b78a280c513161a02bb3b3b479008145a --- /dev/null +++ b/lite/backends/arm/math/layout.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +template +void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y); + +template +void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 0d6eed9904902aa9539caf95172b0e4109e11f7d..cb9c049d81aee73b65bacd27a64138779d1532cc 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/packed_sgemm.h" #include +#include "lite/backends/arm/math/conv_block_utils.h" namespace paddle { namespace lite { @@ -51,8 +52,40 @@ void sgemm_prepacked_8x12(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); + +void pack_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); + +void pack_trans_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + const operators::ActivationParam act_param, + ARMContext *ctx); #else // for kA72 void prepackA_6x8(float *out, @@ -104,7 +137,7 @@ void sgemm_prepacked_6x8(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx); // for kA73, 4x8 void sgemm_prepacked_4x8(bool is_transB, @@ -119,7 +152,7 @@ void sgemm_prepacked_4x8(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const 
operators::ActivationParam act_param, ARMContext *ctx); #endif // __aarch64__ @@ -139,13 +172,21 @@ void prepackA(float *out, bool is_trans, ARMContext *ctx) { #ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (mmax <= 4) { + if (is_trans) { + pack_trans_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + pack_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (is_trans) { + prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } #else - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || mmax <= 4) { if (is_trans) { prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); } else { @@ -209,25 +250,42 @@ void sgemm_prepack(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { #ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); + if (M <= 4) { + sgemm_prepacked_4x4(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + act_param, + ctx); + } else { + sgemm_prepacked_8x12(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + act_param, + ctx); + } #else // armv7 - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || M <= 4) { sgemm_prepacked_4x8(is_transB, M, N, @@ -240,7 +298,7 @@ void sgemm_prepack(bool is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } else { sgemm_prepacked_6x8(is_transB, @@ -255,7 +313,7 @@ void sgemm_prepack(bool is_transB, ldc, bias, has_bias, - has_relu, + act_param, ctx); } #endif // arm64 @@ -522,6 +580,147 @@ void prepackA_8x12(float *dout, } } } +void pack_m4(float *dout, + const float *inptr, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + int x_len = kmax - k0; + int stride = x_len * 4; + float zerobuff[x_len]; // NOLINT + memset(zerobuff, 0, sizeof(float) * x_len); + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + +#pragma omp parallel for + for (int y = m0; y < mmax; y += 4) { + float *outptr = dout + stride * (y - m0) / 4; + + const float *inptr0 = inptr + y * ldin + k0; + const float *inptr1 = inptr0 + ldin; + const float *inptr2 = inptr1 + ldin; + const float *inptr3 = inptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + + int x = x_len; + //! cope with row index exceed real size, set to zero buffer + if ((y + 3) >= mmax) { + switch ((y + 3) - mmax) { + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + default: + break; + } + } + for (; x > 7; x -= 8) { + asm volatile( + "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ + "dup v31.4s, %w[alpha]\n" /* alpha to vector */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ + "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ + "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ + "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ + "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ + "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ + "b 1f\n" /* to main process */ + "0: \n" /* alpha == 1 */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "1: \n" /* main process */ + "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ + "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ + "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ + "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ + + "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ + "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ + "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ + "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ + + "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ + "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ + "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ + "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ + + "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ + "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ + "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ + "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ + "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ + "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ + "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ + "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "cc", + "memory"); + } + + for (; x > 0; x--) { + if (has_alpha) { + *outptr++ = *inptr0++ * alpha; + *outptr++ = *inptr1++ * alpha; + *outptr++ = *inptr2++ * alpha; + *outptr++ = *inptr3++ * alpha; + } else { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + } + } + } +} void prepackA_trans_8x12(float *outptr, const float *in, @@ -682,6 +881,128 @@ void prepackA_trans_8x12(float *outptr, } } } +void pack_trans_m4(float *outptr, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + auto inptr = in + k0 * ldin + m0; + uint32_t mask_buffer[4] = {0, 1, 2, 3}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 4 * (x_len / 4); + int stride_out = 4 * y_len; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask1 = + vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + float32x4_t valpha = vdupq_n_f32(alpha); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + const float *ptr0 = inptr + y * ldin; + const float *ptr1 = ptr0 + ldin; + const float *ptr2 = ptr1 + ldin; + const float *ptr3 = ptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm 
pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) + : "memory"); + + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + vst1q_f32(outptr_row_col, vr00); + vst1q_f32(outptr_row_col + 4, vr10); + vst1q_f32(outptr_row_col + 8, vr20); + vst1q_f32(outptr_row_col + 12, vr30); + + ptr0 += 4; + ptr1 += 4; + ptr2 += 4; + ptr3 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); + float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); + float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); + float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); + + vst1q_f32(outptr_row_col, vr00_1); + vst1q_f32(outptr_row_col + 4, vr10_1); + vst1q_f32(outptr_row_col + 8, vr20_1); + vst1q_f32(outptr_row_col + 12, vr30_1); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + const float *ptr0 = inptr + y * ldin; + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr0 = vld1q_f32(ptr0); + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + vst1q_f32(outptr_row_col, vr0); + + ptr0 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr0 = vld1q_f32(ptr0); + + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + + float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); + + vst1q_f32(outptr_row_col, vr0_1); + } + } +} #else // __aarch64__ void prepackA_6x8(float* outptr, @@ -1963,7 +2284,7 @@ void sgemm_prepacked_8x12(bool is_transB, int ldc, const float *bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); @@ -2517,33 +2838,6 @@ void sgemm_prepacked_8x12(bool is_transB, "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ @@ -2566,7 +2860,6 @@ void sgemm_prepacked_8x12(bool is_transB, [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [has_beta] "r"(has_beta), [beta] "r"(beta) : "cc","memory", @@ -2591,6 +2884,298 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float *dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } +} + +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + const operators::ActivationParam act_param, + ARMContext *ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto workspace = ctx->workspace_data(); + int threads = ctx->threads(); + + const int n_block = 4; + const int m_block = 4; + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); + x_block /= n_block; + x_block *= n_block; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + n_block - 1) / n_block; + x_block *= n_block; + x_block = x_block < n_block ? n_block : x_block; + + // unroll 2 loop + int tail_pre = (K & (KBLOCK - 1)); + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + bool flag_p_remain = false; + int remain = 0; + + int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; + //! apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + n_block - 1) / n_block; + remain = xmax - x0 - (bblocks - 1) * n_block; + if (remain > 0) { + flag_p_remain = true; + } + //! 
load bpanel + float *b_pannel = workspace; + if (is_transB) { + pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } else { + pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += m_block) { + unsigned int ymax = y + m_block; + if (ymax > M) { + ymax = M; + } + + float bias_local[4] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + } + + float cout0[n_block]; // NOLINT + float cout1[n_block]; // NOLINT + float cout2[n_block]; // NOLINT + float cout3[n_block]; // NOLINT + + float *c_ptr0 = C + y * ldc + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + float *pout0 = c_ptr0; + float *pout1 = c_ptr1; + float *pout2 = c_ptr2; + float *pout3 = c_ptr3; + + const float *a_ptr_l = A_packed + y * K; + const float *b_ptr_l = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 3) >= ymax) { + switch ((y + 3) - ymax) { + case 2: + c_ptr1 = cout1; + case 1: + c_ptr2 = cout2; + case 0: + c_ptr3 = cout3; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + if (has_beta) { + for (int i = 0; i < remain; ++i) { + cout0[i] = pout0[i]; + cout1[i] = pout1[i]; + cout2[i] = pout2[i]; + cout3[i] = pout3[i]; + } + } + } + const float *a_ptr = a_ptr_l; + const float *b_ptr = b_ptr_l + xb * K * 4; + int tail = tail_pre; + int k = k_pre; + // clang-format off + asm volatile( + "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ + "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2 */ + "dup v8.4s, v2.s[0]\n" /* out0 = bias[0] */ + "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ + "dup v9.4s, v2.s[1]\n" /* out1 = bias[1] */ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ + "dup v10.4s, v2.s[2]\n" /* out2 = bias[2] */ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ + "dup v11.4s, v2.s[3]\n" /* out3 = bias[3] */ + "cbz %w[has_beta], 0f\n" /* skip if beta == 0
*/ + /* process beta */ + "dup v7.4s, %w[beta]\n" /* beta to vector */ + "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ + "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ + "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ + "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ + "ld1 {v2.4s}, [%[c_ptr2]]\n" + "ld1 {v3.4s}, [%[c_ptr3]]\n" + "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ + "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ + + "0: \n" /* check loop count */ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + "cbz %w[k], 2f\n" /* check loop count > 0 */ + /* main loop */ + /* unroll 0 */ + "1:\n" /* main loop */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "bne 1b\n" + "2:\n" /* process tail*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "beq 3f\n" /*jump to tail = 1*/ + /* final unroll 0 */ + /* unroll 0, tail > 1 */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "beq 4f\n" /*jump to tail = 2*/ + /* unroll 1, tail > 2 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + + "beq 5f\n" /*jump to tail = 3*/ + /* unroll 2, tail = 4 */ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + + /* unroll 3, tail = 4 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1],
b3 =q7*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "b 11f\n" + /* tails==1 final tail*/ + "3: \n" /* tail=1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "b 11f\n" + /* tails==2 final tail*/ + "4:\n" /* tail = 2*/ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + + "b 11f\n" + /* tails==3 final tail*/ + "5:\n" /* tail = 3*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + "11: \n" /* check if relu */ + "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ + "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ + "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ + "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ + + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [k] "+r"(k), + [tail] "+r"(tail), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3) + : [bias_ptr] "r"(bias_local), + [has_beta] "r"(has_beta), + [beta] "r"(beta) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + } + } + } + } + } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float *dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } #else // __aarch64__ /** @@ -2616,7 +3201,7 @@ void sgemm_prepacked_6x8(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); @@ -2995,22 +3580,6 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! 
@ store r2\n" @@ -3028,7 +3597,6 @@ void sgemm_prepacked_6x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [beta] "r"(beta) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", @@ -3048,6 +3616,13 @@ void sgemm_prepacked_6x8(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float* dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } void sgemm_prepacked_4x8(bool is_transB, @@ -3062,7 +3637,7 @@ void sgemm_prepacked_4x8(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); @@ -3347,18 +3922,6 @@ void sgemm_prepacked_4x8(bool is_transB, /*aptr - 16*/ "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" @@ -3372,7 +3935,6 @@ void sgemm_prepacked_4x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), [beta] "r"(beta) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", @@ -3389,6 +3951,13 @@ void sgemm_prepacked_4x8(bool is_transB, } } } + if (act_param.has_active) { +#pragma omp parallel for num_threads(threads) + for (unsigned int x = 0; x < M; x++) { + float* dst = C + x * ldc; + act_switch_process(dst, dst, N, &act_param); + } + } } #endif // __aarch64__ diff --git a/lite/backends/arm/math/packed_sgemm.h b/lite/backends/arm/math/packed_sgemm.h index 6c14cdb2ef62558a53c765719107d68da678246b..bc23e8eab7b972fef77fda2360ae8f12c2e5d0e3 100644 --- a/lite/backends/arm/math/packed_sgemm.h +++ b/lite/backends/arm/math/packed_sgemm.h @@ -17,6 +17,7 @@ #include #include "lite/core/context.h" #include "lite/core/tensor.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -74,7 +75,7 @@ void sgemm_prepack(bool is_transB, int ldc, const float* bias, bool has_bias, - bool has_relu, + const operators::ActivationParam act_param, ARMContext* ctx); } // namespace math diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc new file mode 100644 index 0000000000000000000000000000000000000000..af4934e85756f03ec197520b2b5c130e27bdcad6 --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -0,0 +1,1704 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#include <arm_neon.h> + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void loadb_c4(float* out, + const float* in, + const int xstart, + const int xend, + const int k_round, + const int n) { + const int xlen = (xend - xstart + NBLOCK_C4 - 1) / NBLOCK_C4 * NBLOCK_C4; + int xloop = xlen / NBLOCK_C4; + const int flag_remain = n < xstart + xlen; + int remain = 0; + int remain4 = 0; + int remain1 = 0; + if (flag_remain) { + remain = (n - xstart) - (xloop - 1) * NBLOCK_C4; + remain4 = remain >> 2; + remain1 = remain & 3; + xloop -= 1; + } + const int ldo = NBLOCK_C4 * k_round; + const int kloop = k_round >> 2; + in += xstart * 4; + if (xloop > 0) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out + 4 * NBLOCK_C4 * i; + const float* in_ptr = in + i * 4 * n; + for (int j = 0; j < xloop; ++j) { + float* out_p = out_ptr + j * ldo; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "ld1 {v4.4s, v5.4s}, [%[in]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + "ld1 {v6.4s, v7.4s}, [%[in]], #32 \n" + "st1 {v4.4s, v5.4s}, [%[out]], #32 \n" + "st1 {v6.4s, v7.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vld1.32 {d8-d11}, [%[in]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + "vld1.32 {d12-d15}, [%[in]]! \n" + "vst1.32 {d8-d11}, [%[out]]! \n" + "vst1.32 {d12-d15}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch64__ + } + } + } + float* out_remain4 = out + xloop * k_round * NBLOCK_C4; + const float* in_remain4 = in + xloop * NBLOCK_C4 * 4; + if (remain4) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain4 + 4 * 4 * i; + const float* in_ptr = in_remain4 + i * 4 * n; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "v0", "v1", "v2", "v3"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vst1.32 {d4-d7}, [%[out]]!
\n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "q0", "q1", "q2", "q3"); +#endif // __aarch64__ + } + } + float* out_remain1 = out_remain4 + remain4 * k_round * 4; + const float* in_remain1 = in_remain4 + remain4 * 4 * 4; + if (remain1) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain1 + 4 * remain1 * i; + const float* in_ptr = in_remain1 + i * 4 * n; + for (int j = 0; j < remain1; ++j) { + float32x4_t vin = vld1q_f32(in_ptr); + in_ptr += 4; + vst1q_f32(out_ptr, vin); + out_ptr += 4; + } + } + } +} + +void sgemm_prepack_c4_common(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + int threads = ctx->threads(); + auto workspace = ctx->workspace_data(); + // l2 = ablock * K * threads + K * bchunk_w + threads * ablock * bchunk_w; + int bchunk_w = (l2_cache - threads * k_round * sizeof(float)) / + ((k_round + threads * MBLOCK_C4) * sizeof(float)); + bchunk_w = bchunk_w > N ? N : bchunk_w; + bchunk_w = bchunk_w / NBLOCK_C4 * NBLOCK_C4; + bchunk_w = bchunk_w > NBLOCK_C4 ? bchunk_w : NBLOCK_C4; + int bchunk_loop = (N + bchunk_w - 1) / bchunk_w; + + const int h_loop = m_round >> 2; // MBLOCK_C4 == 4; + const int kcnt = (k_round + KBLOCK_C4 - 1) / KBLOCK_C4; + const int ldc = N * 4; + const int lda = k_round * 4; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } + // bchunk_loop + float* c = C; + for (int n = 0; n < bchunk_loop; ++n) { + int x_start = n * bchunk_w; + int x_end = x_start + bchunk_w; + int w_loop = bchunk_w / NBLOCK_C4; + int flag_remain = 0; + int w_loop4 = 0; + int remain = 0; + if (x_end > N) { + w_loop = (N - x_start) / NBLOCK_C4; + int w_loop_rem = (N - x_start) - w_loop * NBLOCK_C4; + w_loop4 = w_loop_rem >> 2; + remain = w_loop_rem & 3; + x_end = N; + flag_remain = 1; + } + float* bchunk = workspace; + loadb_c4(bchunk, B, x_start, x_end, k_round, N); + float* cchunk = c + n * bchunk_w * 4; + int has_remain = (n == bchunk_loop - 1) && flag_remain; +#pragma omp parallel for num_threads(threads) + for (int h = 0; h < h_loop; ++h) { + float* bias_h = bias_buf + h * 4; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vbias = vld1q_f32(bias_h); +#endif + const float* ablock = A_packed + h * lda; + const float* bblock = bchunk; + float* cblock = cchunk + h * ldc; + for (int w = 0; w < w_loop; ++w) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "prfm pldl1keep, [%[b], #64] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "mov v13.16b, %[vbias].16b \n" /* mov bias to c4*/ + "mov v14.16b, %[vbias].16b \n" /* mov bias to c5*/ + "mov v15.16b, %[vbias].16b \n" /* mov bias to c6*/ + "mov v16.16b, %[vbias].16b \n" /* mov bias to c7*/ + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], 
#32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load b4b5b6b7 to v25-v28 */ + "ld1 {v25.4s, v26.4s}, [%[b]], #32 \n" + "ld1 {v27.4s, v28.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[a], #32] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b], #64] \n" + "fmla v13.4s, v1.4s, v25.s[0] \n" + "fmla v14.4s, v1.4s, v26.s[0] \n" + "fmla v15.4s, v1.4s, v27.s[0] \n" + "fmla v16.4s, v1.4s, v28.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b], #128] \n" + "fmla v13.4s, v2.4s, v25.s[1] \n" + "fmla v14.4s, v2.4s, v26.s[1] \n" + "fmla v15.4s, v2.4s, v27.s[1] \n" + "fmla v16.4s, v2.4s, v28.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + "fmla v13.4s, v3.4s, v25.s[2] \n" + "fmla v14.4s, v3.4s, v26.s[2] \n" + "fmla v15.4s, v3.4s, v27.s[2] \n" + "fmla v16.4s, v3.4s, v28.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + + "fmla v13.4s, v4.4s, v25.s[3] \n" + "fmla v14.4s, v4.4s, v26.s[3] \n" + "fmla v15.4s, v4.4s, v27.s[3] \n" + "fmla v16.4s, v4.4s, v28.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "fmax v16.4s, v16.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + "st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [vbias] "w"(vbias), [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v25", "v26", "v27", "v28", "cc", "memory"); +#else + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + "pld [%[a]] \n" + "pld [%[b]] \n" + "pld [%[b], #64] \n" + "vmov.32 q8, q3 \n" /* mov bias to c0*/ + "vmov.32 q9, q3 \n" /* mov bias to c1*/ + "vmov.32 q10, q3 \n" /* mov bias to c2*/ + "vmov.32 q11, q3 \n" /* mov bias to c3*/ + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmov.32 q12, q3 \n" /* mov bias to c4*/ + "vmov.32 q13, q3 \n" /* mov bias to c5*/ + "vmov.32 q14, q3 \n" /* mov bias to c6*/ + "vmov.32 q15, q3 \n" /* mov bias to c7*/ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vld1.f32 {d8-d11}, [%[b]]! 
\n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + /* c4c5c6c7 */ + "vmla.f32 q12, q0, d8[0] \n" + "vmla.f32 q13, q0, d10[0] \n" + "vmla.f32 q14, q0, d12[0] \n" + "vmla.f32 q15, q0, d14[0] \n" + "pld [%[a], #32] \n" + "vmla.f32 q12, q1, d8[1] \n" + "vmla.f32 q13, q1, d10[1] \n" + "vmla.f32 q14, q1, d12[1] \n" + "vmla.f32 q15, q1, d14[1] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q12, q2, d9[0] \n" + "vmla.f32 q13, q2, d11[0] \n" + "vmla.f32 q14, q2, d13[0] \n" + "vmla.f32 q15, q2, d15[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q12, q3, d9[1] \n" + "vmla.f32 q13, q3, d11[1] \n" + "vmla.f32 q14, q3, d13[1] \n" + "vmla.f32 q15, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); +#endif + // clang-format on + } + if (has_remain) { + if (w_loop4 > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b]] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "cc", "memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d6-d7}, [%[bias]] \n" + "vld1.32 {d0-d3}, [%[a]]! 
\n" /* load a0 a1 */ + "vmov.32 q8, q3 \n" /* mov bias to c0 */ + "vmov.32 q9, q3 \n" /* mov bias to c1 */ + "vmov.32 q10, q3 \n" /* mov bias to c2 */ + "vmov.32 q11, q3 \n" /* mov bias to c3 */ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "pld [%[a]] \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "cc", "memory"); +#endif + // clang-format on + } + if (remain > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "cmp %w[remain], #3 \n" + "beq 1f \n" + "cmp %w[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vzero].16b \n" /* mov zero to c1*/ + "3: \n" + "ld1 {v5.4s}, [%[b]], #16 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v2.4s, v5.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v4.4s, v5.s[3] \n" + "bne 3b \n" + "fadd v9.4s, v9.4s, v10.4s \n" + "cbz %w[relu], 6f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "6: \n" + "st1 {v9.4s}, [%[c]], #16 \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vzero].16b \n" /* mov zero to c2*/ + "mov v12.16b, %[vzero].16b \n" /* mov zero to c3*/ + "4: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v2.4s, v5.s[1] \n" + "fmla v12.4s, v2.4s, v6.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v4.4s, v5.s[3] \n" + "fmla v12.4s, v4.4s, v6.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 4b \n" + "fadd v9.4s, v9.4s, v11.4s \n" + "fadd v10.4s, v10.4s, v12.4s \n" + "cbz %w[relu], 7f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "7: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "5: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s}, [%[b]], #16 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + 
"fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 5b \n" + "cbz %w[relu], 8f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "8: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "st1 {v11.4s}, [%[c]], #16 \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [remain] "r"(remain), [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "cc","memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d0-d1}, [%[bias]] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmov.u32 q15, #0 \n" + "cmp %[remain], #3 \n" + "beq 1f \n" + "cmp %[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "vmov.32 q9, q0 \n" /* mov bias to c0*/ + "vmov.32 q10, q15 \n" /* mov zero to c1*/ + "3: \n" + "vld1.32 {d10-d11}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q2, d10[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q4, d11[1] \n" + "bne 3b \n" + "vadd.f32 q9, q9, q10 \n" + "cmp %[relu], #0 \n" + "beq 6f \n" + "vmax.f32 q9, q9, q15 \n" + "6: \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q15 \n" /* mov zero to c2*/ + "vmov.u32 q12, q15 \n" /* mov zero to c3*/ + "4: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q2, d10[1] \n" + "vmla.f32 q12, q2, d12[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q4, d11[1] \n" + "vmla.f32 q12, q4, d13[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 4b \n" + "vadd.f32 q9, q9, q11 \n" + "vadd.f32 q10, q10, q12 \n" + "cmp %[relu], #0 \n" + "beq 7f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "7: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q0 \n" /* mov bias to c2*/ + "5: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d14-d15}, [%[b]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q1, d14[0] \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q2, d10[1] \n" + "vmla.f32 q10, q2, d12[1] \n" + "vmla.f32 q11, q2, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q3, d15[0] \n" + "pld [%[a]] \n" + "vmla.f32 q9, q4, d11[1] \n" + "vmla.f32 q10, q4, d13[1] \n" + "vmla.f32 q11, q4, d15[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 5b \n" + "cmp %[relu], #0 \n" + "beq 8f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "vmax.f32 q11, q11, q15 \n" + "8: \n" + "vst1.32 {d18-d21}, [%[c]]! 
\n" + "vst1.32 {d22-d23}, [%[c]]! \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [remain] "r"(remain) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q9", + "q10", "q11", "q12", "q15", "cc","memory"); +#endif + // clang-format on + } + } + } + } +} +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + const float* bias_ptr = bias_buf; + for (int m = 0; m < mloop; ++m) { +#ifdef __aarch64__ + float32x4_t vbias = vld1q_f32(bias_ptr); +#endif + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c7*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "mov v12.16b, %[vbias].16b \n" + "mov v13.16b, %[vbias].16b \n" + "mov v14.16b, %[vbias].16b \n" + "mov v15.16b, %[vbias].16b \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax 
v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c3*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0 */ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vzero].16b \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; 
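+ // each single-column iteration consumes one c4-packed column of B (4 floats), hence b += 4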
+ } +#else + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c7*/ + "vmov.u32 q8, q3 \n" + "vmov.u32 q9, q3 \n" + "vmov.u32 q10, q3 \n" + "vmov.u32 q11, q3 \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmov.u32 q12, q3 \n" + "vmov.u32 q13, q3 \n" + "vmov.u32 q14, q3 \n" + "vmov.u32 q15, q3 \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c3*/ + "vmov.u32 q8, q12 \n" + "vmov.u32 q9, q12 \n" + "vmov.u32 q10, q12 \n" + "vmov.u32 q11, q12 \n" + "vmov.u32 q13, #0 \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q8, q8, q13 \n" + "vmax.f32 q9, q9, q13 \n" + "vmax.f32 q10, q10, q13 \n" + "vmax.f32 q11, q11, q13 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d14-d15}, [%[bias]] \n" + "vmov.u32 q8, #0 \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* mov bias to c0 */ + "vmov.u32 q5, q7 \n" + "vmov.u32 q6, q8 \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! 
\n" + "bne 1b \n" + "vadd.f32 q5, q5, q6 \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q5, q5, q8 \n" + "2:\n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + bias_ptr += 4; + A_packed += lda; + } +} + +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + for (int m = 0; m < mloop; ++m) { + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "0:\n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "fmul v8.4s, v16.4s, v0.s[0] \n" + "fmul v9.4s, v16.4s, v1.s[0] \n" + "fmul v10.4s, v16.4s, v2.s[0] \n" + "fmul v11.4s, v16.4s, v3.s[0] \n" + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmul v12.4s, v16.4s, v4.s[0] \n" + "fmul v13.4s, v16.4s, v5.s[0] \n" + "fmul v14.4s, v16.4s, v6.s[0] \n" + "fmul v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + 
"ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmul v8.4s, v16.4s, v0.s[0] \n" + "fmul v9.4s, v16.4s, v1.s[0] \n" + "fmul v10.4s, v16.4s, v2.s[0] \n" + "fmul v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + 
[vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + "fmul v8.4s, v16.4s, v0.s[0] \n" + "fmul v9.4s, v17.4s, v0.s[1] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "beq 2f \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "2:\n" + "fadd v8.4s, v8.4s, v9.4s \n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vld1.32 {d0-d3}, [%[b]]! \n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmul.f32 q8, q4, d0[0] \n" + "vmul.f32 q9, q4, d2[0] \n" + "vmul.f32 q10, q4, d4[0] \n" + "vmul.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmul.f32 q12, q4, d0[0] \n" + "vmul.f32 q13, q4, d2[0] \n" + "vmul.f32 q14, q4, d4[0] \n" + "vmul.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "beq 2f \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! 
\n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmul.f32 q8, q4, d0[0] \n" + "vmul.f32 q9, q4, d2[0] \n" + "vmul.f32 q10, q4, d4[0] \n" + "vmul.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! 
\n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + "vmul.f32 q5, q1, d0[0] \n" + "vmul.f32 q6, q2, d0[1] \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "beq 2f \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 1b \n" + "2:\n" + "vadd.f32 q5, q5, q6 \n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + A_packed += lda; + } +} + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + if (N > 16) { + sgemm_prepack_c4_common( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } else { + sgemm_prepack_c4_small( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h new file mode 100644 index 0000000000000000000000000000000000000000..3229ff3e0774ce8bff02b12d79d7ec50ed873cea --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
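// A hedged usage sketch (illustrative only, not part of the shipped API):
// a typical call into the c4-packed GEMM declared below. A must already be
// packed into 4-row blocks (MBLOCK_C4) and C is produced in the same c4
// layout; `ctx` is an initialized ARMContext.
//
//   paddle::lite::arm::math::sgemm_prepack_c4(
//       M, N, K, A_packed, B, C, bias,
//       /*has_bias=*/true, /*has_relu=*/false, &ctx);
//
// The dispatcher in packed_sgemm_c4.cc routes N > 16 to the cache-blocked
// sgemm_prepack_c4_common path and smaller N to the register-resident
// sgemm_prepack_c4_small kernels shown above.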
+ +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +constexpr int MBLOCK_C4 = 4; +constexpr int NBLOCK_C4 = 8; +constexpr int KBLOCK_C4 = 4; + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + ARMContext* ctx); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index a857e9830c54b568c93afa4c1aa119ed2baffa1e..07cbd00378c082e311e194c7b22b6d3cb195a63a 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -46,7 +46,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -125,18 +125,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? 
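/* Hedged note on the padding change in this hunk: `paddings` is now a
   four-element vector ordered {top, bottom, left, right}, so pad_h reads
   paddings[0], pad_w reads paddings[2], and the window clamps use
   paddings[3] (right) above and paddings[1] (bottom) below, truncating
   kernels that run past the padded edge. */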
(hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -163,7 +167,7 @@ void pooling_basic(const float* din, "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "fmax v6.4s, v4.4s, v5.4s \n" \ "subs %w[cnt], %w[cnt], #1 \n" \ - "fmax %w[vmax].4s, %w[vmax].4s, v6.4s \n" \ + "fmax %[vmax].4s, %[vmax].4s, v6.4s \n" \ "bne 1b \n" #define GLOBAL_AVG \ "1: \n" \ @@ -172,7 +176,7 @@ void pooling_basic(const float* din, "ld1 {v0.4s-v1.4s}, [%[data_in_channel]], #32 \n" \ "fadd %[vsum].4s, %[vsum].4s, v3.4s \n" \ "subs %w[cnt], %w[cnt], #1 \n" \ - "fadd %w[vsum].4s, %w[vsum].4s, v4.4s \n" \ + "fadd %[vsum].4s, %[vsum].4s, v4.4s \n" \ "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "bne 1b \n" @@ -894,6 +898,121 @@ void pooling_global_avg(const float* din, } } +void pooling1x1s2p0_max(const float* din, + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win) { + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + auto data_out = static_cast(dout); + auto data_in = static_cast(din); + + int w_unroll_size = wout / 4; + int w_unroll_remian = wout - w_unroll_size * 4; + int win_ext = w_unroll_size * 8; + auto zero_ptr = + static_cast(TargetMalloc(TARGET(kARM), win * sizeof(float))); + memset(zero_ptr, 0, win * sizeof(float)); + auto write_ptr = + static_cast(TargetMalloc(TARGET(kARM), wout * sizeof(float))); + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + for (int h = 0; h < hout; h += 4) { + const float* din0_ptr = data_in_channel + h * 2 * win; + const float* din1_ptr = din0_ptr + 2 * win; + const float* din2_ptr = din1_ptr + 2 * win; + const float* din3_ptr = din2_ptr + 2 * win; + + float* doutr0 = data_out_channel + h * wout; + float* doutr1 = doutr0 + wout; + float* doutr2 = doutr1 + wout; + float* doutr3 = doutr2 + wout; + if (h + 4 > hout) { + switch (h + 4 - hout) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + if (h * 2 + 7 > hin) { + switch (h * 2 + 7 - hin) { + case 7: + din0_ptr = zero_ptr; + case 6: + case 5: + din1_ptr = zero_ptr; + case 4: + case 3: + din2_ptr = zero_ptr; + case 2: + case 1: + din3_ptr = zero_ptr; + default: + break; + } + } + for (int i = 0; i < w_unroll_size; i++) { + float32x4x2_t din0 = vld2q_f32(din0_ptr); + float32x4x2_t din1 = vld2q_f32(din1_ptr); + float32x4x2_t din2 = vld2q_f32(din2_ptr); + float32x4x2_t din3 = vld2q_f32(din3_ptr); + din0_ptr += 8; + din1_ptr += 8; + din2_ptr += 8; + din3_ptr += 8; + + vst1q_f32(doutr0, din0.val[0]); + vst1q_f32(doutr1, din1.val[0]); + vst1q_f32(doutr2, din2.val[0]); + vst1q_f32(doutr3, din3.val[0]); + + doutr0 += 4; + doutr1 += 4; + doutr2 += 4; + doutr3 += 4; + } + int j = win_ext; + for (int i = 0; i < w_unroll_remian; i++) { + if (j >= win) { + *doutr0++ = 0.f; + *doutr1++ = 0.f; + *doutr2++ = 0.f; + *doutr3++ = 0.f; + } else { + *doutr0++ = *din0_ptr; + *doutr1++ = *din1_ptr; + *doutr2++ = *din2_ptr; + *doutr3++ = *din3_ptr; + din0_ptr += 2; + din1_ptr += 2; + din2_ptr += 2; + din3_ptr += 2; + } + j += 2; + } + } + } + } + 
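/* What the intrinsic loop above computes, as a hedged scalar sketch:
   1x1 max pooling with stride 2 and pad 0 reduces to picking every other
   element of every other row, i.e.

     for (int oh = 0; oh < hout; ++oh)
       for (int ow = 0; ow < wout; ++ow)
         dout[oh * wout + ow] = din[(2 * oh) * win + 2 * ow];

   vld2q_f32 de-interleaves eight floats into even/odd lanes and .val[0]
   (the even columns) is stored directly; rows read past hin fall back to
   zero_ptr, and columns past win are written as 0.f. */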
TargetFree(TARGET(kARM), zero_ptr); + TargetFree(TARGET(kARM), write_ptr); +} + void pooling2x2s2_max(const float* din, float* dout, int num, diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h index 9288f27bbc7519f1b06bfa1f119a21a33611f74c..701732cb453bfc9f2e970c83c8d713e70a205434 100644 --- a/lite/backends/arm/math/pooling.h +++ b/lite/backends/arm/math/pooling.h @@ -64,6 +64,16 @@ void pooling_global_avg(const float* din, int hin, int win); +void pooling1x1s2p0_max(const float* din, + float* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win); + void pooling2x2s2_max(const float* din, float* dout, int num, diff --git a/lite/backends/arm/math/reduce_prod.cc b/lite/backends/arm/math/reduce_prod.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7b3f7095f2087af365d0765f49df7902df42bb9 --- /dev/null +++ b/lite/backends/arm/math/reduce_prod.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/arm/math/reduce_prod.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math {} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/reduce_prod.h b/lite/backends/arm/math/reduce_prod.h new file mode 100644 index 0000000000000000000000000000000000000000..6c8898288fa498a6f97709a27306e6975dffc975 --- /dev/null +++ b/lite/backends/arm/math/reduce_prod.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/

+#pragma once
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void reduce_prod_n(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = channel_in * hw_size;
+  int data_index, src_index, src_index0;
+  for (int c = 0; c < channel_in; ++c) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = c * hw_size + h * width_in + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int n = 0; n < num_in; ++n) {
+          src_index = n * chw_size + data_index;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_c(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = hw_size * channel_in;
+  int data_index, src_index0, src_index;
+  for (int n = 0; n < num_in; ++n) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * hw_size + h * width_in + w;
+        src_index0 = n * chw_size + h * width_in + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int c = 0; c < channel_in; ++c) {
+          src_index = src_index0 + c * hw_size;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_h(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int cw_size = channel_in * width_in;
+  int chw_size = cw_size * height_in;
+  int hw_size = height_in * width_in;
+  int data_index, src_index, src_index0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * cw_size + c * width_in + w;
+        src_index0 = n * chw_size + c * hw_size + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int h = 0; h < height_in; ++h) {
+          src_index = src_index0 + h * width_in;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_w(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int ch_size = channel_in * height_in;
+  int hw_size = height_in * width_in;
+  int chw_size = ch_size * width_in;
+  int data_index = 0;
+  int src_index0 = 0;
+  int src_index = 0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int h = 0; h < height_in; ++h) {
+        data_index = n * ch_size + c * height_in + h;
+        src_index0 = n * chw_size + c * hw_size + h * width_in;
+        dst[data_index] = static_cast<T>(1);
+        for (int w = 0; w < width_in; ++w) {
+          src_index = src_index0 + w;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_nc(const T* src,
+                    T* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce n first.
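/* Hedged note: the mixed-axis reductions here and below (_nc, _ch, _hw)
   compose two single-axis passes through a scratch tensor. For an input
   of shape {2, 3, H, W}, reduce_prod_nc first collapses N into a
   {1, 3, H, W} scratch, then collapses C out of that scratch. */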
+ DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + auto* tmp_out = tensor_tmp.mutable_data(); + reduce_prod_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_prod_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +template +void reduce_prod_ch(const T* src, + T* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + auto* tmp_out = tensor_tmp.mutable_data(); + reduce_prod_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_prod_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +template +void reduce_prod_hw(const T* src, + T* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + auto* tmp_out = tensor_tmp.mutable_data(); + reduce_prod_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_prod_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +template +void reduce_prod_all(const T* src, T* dst, int64_t total_num) { + dst[0] = static_cast(1); + for (int n = 0; n < total_num; ++n) { + dst[0] *= src[n]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/sgemm.cc b/lite/backends/arm/math/sgemm.cc index f3123ddd718ee61b6430d2b7f14480b79435291a..f2ba090222d491f4032aaf4cf3dbb29b4c53708d 100644 --- a/lite/backends/arm/math/sgemm.cc +++ b/lite/backends/arm/math/sgemm.cc @@ -34,7 +34,7 @@ void sgemm(bool is_transA, int ldc, const float* bias, bool is_bias, - bool is_relu, + const operators::ActivationParam act_param, ARMContext* ctx) { int hblock = get_hblock(ctx); int m_roundup = hblock * ((M + hblock - 1) / hblock); @@ -56,7 +56,7 @@ void sgemm(bool is_transA, ldc, bias, is_bias, - is_relu, + act_param, ctx); TargetFree(TargetType::kARM, packed_A); } diff --git a/lite/backends/arm/math/sgemm.h b/lite/backends/arm/math/sgemm.h index 08f68fb3d41e5d0a837f57a8d28acd82dd3f8cb4..b48080855fa8eedad9d619c1fbc84c9fd0040504 100644 --- a/lite/backends/arm/math/sgemm.h +++ b/lite/backends/arm/math/sgemm.h @@ -39,7 +39,7 @@ void sgemm(bool is_transA, int ldc, const float* bias, bool is_bias, - bool is_relu, + const operators::ActivationParam act_param, ARMContext* ctx); } // namespace math diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 1830423136cc883d30d4eecad0eb9fcfc9ded6ba..98404fe60fdb1384d390458e10dac8c967fd2b21 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -22,35 +22,87 @@ namespace lite { namespace arm { namespace math { -void sgemv(const bool transA, - const int M, +void sgemv(const int M, const int N, const float *A, const float *x, - float *y); - -void sgemv_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y); + float *y, + bool flag_bias, + const float *bias); -void sgemv_bias(const bool transA, - const int M, +void sgemv_relu(const int M, const int N, const float *A, const float *x, float *y, + bool flag_bias, const float *bias); -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias); +void sgemv_relu6(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const 
float *bias, + const float six); + +void sgemv_leakey_relu(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + const float alpha); + +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_act, + lite_api::ActivationType act, + const ARMContext *ctx, + float six, + float alpha); + +bool sgemv(const float *A, + const float *x, + float *y, + bool transA, + int M, + int N, + bool is_bias, + const float *bias, + bool flag_act, + lite_api::ActivationType act, + const ARMContext *ctx, + float six, + float alpha) { + if (transA) { + sgemv_trans(M, N, A, x, y, is_bias, bias, flag_act, act, ctx, six, alpha); + } else { + if (flag_act) { + if (act == lite_api::ActivationType::kRelu) { + sgemv_relu(M, N, A, x, y, is_bias, bias); + } else if (act == lite_api::ActivationType::kRelu6) { + sgemv_relu6(M, N, A, x, y, is_bias, bias, six); + } else if (act == lite_api::ActivationType::kLeakyRelu) { + sgemv_leakey_relu(M, N, A, x, y, is_bias, bias, alpha); + } else { + LOG(FATAL) + << "sgemv no transA only support relu, relu6, leakey relu fusion"; + } + } else { + sgemv(M, N, A, x, y, is_bias, bias); + } + } + return true; +} + #ifdef __aarch64__ void sgemv_trans(const int M, const int N, @@ -59,8 +111,11 @@ void sgemv_trans(const int M, float *y, bool flag_bias, const float *bias, - bool flag_relu, - const ARMContext *ctx) { + bool flag_act, + lite_api::ActivationType act, + const ARMContext *ctx, + float six, + float alpha) { int m_cnt16 = M >> 4; int m_cnt8 = (M & 15) >> 3; int m_cnt4 = (M & 15 & 7) >> 2; @@ -281,26 +336,70 @@ void sgemv_trans(const int M, valid_ths = rdc_ths; rdc_ths = rdc_ths >> 1; } - if (flag_relu) { + if (flag_act) { float *in_y = y_buf; float32x4_t vzero = vdupq_n_f32(0.f); - if (cnt4 > 0) { - int cnt = cnt4; - asm volatile( - "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ - "1:\n" - "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ - "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ - "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ - "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ - "bne 1b \n" /* branch to label 1*/ - "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ - : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) - : [vzero] "w"(vzero) - : "v0", "v1", "cc", "memory"); - } - for (int r = 0; r < remain; ++r) { - y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + if (act == lite_api::ActivationType::kRelu) { + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + } + } else if (act == lite_api::ActivationType::kRelu6) { + float32x4_t vsix = vdupq_n_f32(six); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu6 */ + "fmin v1.4s, v1.4s, %[vsix].4s \n" /* v1 relu6 */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero), [vsix] "w"(vsix) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + y[r] = y[r] > six ? six : y[r]; + } + } else if (act == lite_api::ActivationType::kLeakyRelu) { + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "1:\n" + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v0.4s, %[valpha].4s \n" /* vmulq_f32 */ + "bif v0.16b, v5.16b, v4.16b \n" /* choose */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v0.4s}, [%[out_y]], #16 \n" /* store v0 to y */ + "bne 1b \n" /* branch to label 1*/ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero), [valpha] "w"(valpha) + : "v0", "v4", "v5", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] < 0.f ? alpha * in_y[r] : in_y[r]; + } } } else { memcpy(y, y_buf, M * sizeof(float)); @@ -314,8 +413,11 @@ void sgemv_trans(const int M, float *y, bool flag_bias, const float *bias, - bool flag_relu, - const ARMContext *ctx) { + bool flag_act, + lite_api::ActivationType act, + const ARMContext *ctx, + float six, + float alpha) { int m_cnt8 = M >> 3; int m_cnt4 = (M & 7) >> 2; int m_remain = M & 7 & 3; @@ -497,43 +599,73 @@ void sgemv_trans(const int M, valid_ths = rdc_ths; rdc_ths = rdc_ths >> 1; } - if (flag_relu) { + // do activation + if (flag_act) { float *in_y = y_buf; float32x4_t vzero = vdupq_n_f32(0.f); - if (m_cnt8 > 0) { - int cnt8 = m_cnt8; - asm volatile( - "vld1.32 {d0-d3}, [%[in_y]]! \n" /* load y to q0, q1 */ - "1:\n" - "vmax.f32 q2, q0, %q[vzero] \n" /* q0 relu */ - "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ - "vmax.f32 q3, q1, %q[vzero] \n" /* q1 relu */ - "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ - "vst1.32 {d4-d7}, [%[out_y]]! \n" /* store q0, q1 to y*/ - "vld1.32 {d2-d3}, [%[in_y]]! \n" /* load y to q0 */ - "bne 1b \n" /* branch to label 1*/ - "sub %[in_y], %[in_y], #32 \n" /* restore in_y */ - : [cnt] "+r"(cnt8), [in_y] "+r"(in_y), [out_y] "+r"(y) - : [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - if (m_cnt4 > 0) { - int cnt4 = m_cnt4; - asm volatile( - "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ - "1:\n" - "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ - "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ - "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ - "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ - "bne 1b \n" /* branch to label 1*/ - "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ - : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) - : [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - for (int r = 0; r < m_remain; ++r) { - y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + m_cnt4 = M >> 2; + m_remain = M & 3; + if (act == lite_api::ActivationType::kRelu) { + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + } + } else if (act == lite_api::ActivationType::kRelu6) { + float32x4_t vsix = vdupq_n_f32(six); + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu6 */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vmin.f32 q1, q1, %q[vsix] \n" /* q0 relu6 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero), [vsix] "w"(vsix) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + y[r] = y[r] > six ? six : y[r]; + } + } else if (act == lite_api::ActivationType::kLeakyRelu) { + float32x4_t valpha = vdupq_n_f32(alpha); + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "1:\n" + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vcge.f32 q3, q0, %q[vzero] \n" /* vcgeq_f32 */ + "vmul.f32 q4, q0, %q[valpha] \n" /* vmulq_f32 */ + "vbif q0, q4, q3 \n" /* choose */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d0-d1}, [%[out_y]]! \n" /* store q0 to y */ + "bne 1b \n" /* branch to label 1*/ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero), [valpha] "w"(valpha) + : "q0", "q3", "q4", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] < 0.f ? alpha * in_y[r] : in_y[r]; + } } } else { memcpy(y, y_buf, M * sizeof(float)); @@ -541,41 +673,6 @@ void sgemv_trans(const int M, } #endif // __aarch64__ -bool sgemv(const float *A, - const float *x, - float *y, - bool transA, - int M, - int N, - bool is_bias, - const float *bias, - bool is_relu, - const ARMContext *ctx) { - if (transA) { - sgemv_trans(M, N, A, x, y, is_bias, bias, is_relu, ctx); - } else { - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } - } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); - } else { - //! without relu - sgemv(transA, M, N, A, x, y); - } - } - } - return true; -} // clang-format off //! 
define compute kernel #ifdef __aarch64__ @@ -715,19 +812,19 @@ bool sgemv(const float *A, #define SGEMV_KERNEL_1 \ /* check main loop */ \ "cmp %w[cnt], #1 \n" /* check whether has main loop */ \ - "blt 2f \n" /* jump to tail */ /* main loop */ \ - "1: \n" /* main loop */ \ - "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ - "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ - "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ - "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ + "blt 2f \n" /* jump to tail */ \ + "1: \n" /* main loop */ \ + "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ + "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ + "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ + "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ + "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ "bne 1b \n" /* jump to main loop */ \ /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ - "faddp s8, v10.2s \n" /* pair add to scale */ /* check tails */ \ + "faddp s8, v10.2s \n" /* pair add to scale */ \ "cmp %w[tail], #1 \n" /* check whether has tail */ \ "blt 4f \n" /* jump to end */ \ "3: \n" /* tail loop */ \ @@ -737,43 +834,100 @@ bool sgemv(const float *A, "subs %w[tail], %w[tail], #1\n" /* sub tail loop count */ \ "bne 3b \n" /* jump to tail loop */ -#define SGEMV_OUT_8 \ - /* end */ \ - "4: \n" /* end */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ +#define SGEMV_OUT_8 \ + /* end */ \ + "4: \n" /* end */ \ + "mov v8.s[1], v9.s[0] \n" /* ins s9 to v8[1]*/ \ + "mov v8.s[2], v10.s[0] \n" /* ins s10 to v8[2]*/ \ + "mov v8.s[3], v11.s[0] \n" /* ins s11 to v8[3]*/ \ + "mov v9.s[0], v12.s[0] \n" /* ins s12 to v9[0]*/ \ + "mov v9.s[1], v13.s[0] \n" /* ins s13 to v9[1]*/ \ + "mov v9.s[2], v14.s[0] \n" /* ins s14 to v9[2]*/ \ + "mov v9.s[3], v15.s[0] \n" /* ins s15 to v9[3]*/ \ + "stp q8, q9, [%[out]] \n" /* save result */ #define SGEMV_OUT_8_RELU \ /* end */ \ - "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ - "fmax s9, s9, s0 \n" /* relu */ \ - "fmax s10, s10, s0 \n" /* relu */ \ - "fmax s11, s11, s0 \n" /* relu */ \ - "fmax s12, s12, s0 \n" /* relu */ \ - "fmax s13, s13, s0 \n" /* relu */ \ - "fmax s14, s14, s0 \n" /* relu */ \ - "fmax s15, s15, s0 \n" /* relu */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ + "4: \n" /* end */ \ + "mov v8.s[1], v9.s[0] \n" /* ins s9 to v8[1]*/ \ + "mov v8.s[2], v10.s[0] \n" /* ins s10 to v8[2]*/ \ + "mov v8.s[3], v11.s[0] \n" /* ins s11 to v8[3]*/ \ + "mov v9.s[0], v12.s[0] \n" /* ins s12 to v9[0]*/ \ + "mov v9.s[1], v13.s[0] \n" /* ins s13 to v9[1]*/ \ + "mov v9.s[2], v14.s[0] \n" /* ins s14 to v9[2]*/ \ + "mov v9.s[3], v15.s[0] \n" /* ins s15 to v9[3]*/ \ + "movi v2.4s, #0 \n" /* zero data for relu */\ + "fmax v8.4s, v8.4s, v2.4s \n" /* relu */ \ + "fmax v9.4s, v9.4s, v2.4s \n" /* relu */ \ + "stp q8, q9, [%[out]] \n" /* save result */ -#define SGEMV_OUT_1 \ - /* end */ \ - "4: \n" /* end */ \ +#define SGEMV_OUT_8_RELU6 \ + /* end */ \ + "4: \n" /* end */ \ + "mov v8.s[1], 
v9.s[0] \n" /* ins s9 to v8[1]*/ \ + "mov v8.s[2], v10.s[0] \n" /* ins s10 to v8[2]*/ \ + "mov v8.s[3], v11.s[0] \n" /* ins s11 to v8[3]*/ \ + "mov v9.s[0], v12.s[0] \n" /* ins s12 to v9[0]*/ \ + "mov v9.s[1], v13.s[0] \n" /* ins s13 to v9[1]*/ \ + "mov v9.s[2], v14.s[0] \n" /* ins s14 to v9[2]*/ \ + "mov v9.s[3], v15.s[0] \n" /* ins s15 to v9[3]*/ \ + "movi v2.4s, #0 \n" /* zero data for relu6 */\ + "fmax v8.4s, v8.4s, v2.4s \n" /* relu6 */ \ + "fmax v9.4s, v9.4s, v2.4s \n" /* relu6 */ \ + "fmin v8.4s, v8.4s, %[vsix].4s \n" /* relu */ \ + "fmin v9.4s, v9.4s, %[vsix].4s \n" /* relu */ \ + "stp q8, q9, [%[out]] \n" /* save result */ + +#define SGEMV_OUT_8_LEAKEY_RELU \ + /* end */ \ + "4: \n" /* end */ \ + "mov v8.s[1], v9.s[0] \n" /* ins s9 to v8[1]*/ \ + "mov v8.s[2], v10.s[0] \n" /* ins s10 to v8[2]*/ \ + "mov v8.s[3], v11.s[0] \n" /* ins s11 to v8[3]*/ \ + "mov v9.s[0], v12.s[0] \n" /* ins s12 to v9[0]*/ \ + "mov v9.s[1], v13.s[0] \n" /* ins s13 to v9[1]*/ \ + "mov v9.s[2], v14.s[0] \n" /* ins s14 to v9[2]*/ \ + "mov v9.s[3], v15.s[0] \n" /* ins s15 to v9[3]*/ \ + "movi v2.4s, #0 \n" /* zero data for leakey relu */ \ + "fcmge v4.4s, v8.4s, v2.4s \n" /* vcgeq_f32 */ \ + "fmul v5.4s, v8.4s, %[valpha].4s \n" /* vmulq_f32 */ \ + "fcmge v6.4s, v9.4s, v2.4s \n" /* vcgeq_f32 */ \ + "fmul v7.4s, v9.4s, %[valpha].4s \n" /* vmulq_f32 */ \ + "bif v8.16b, v5.16b, v4.16b \n" /* choose*/ \ + "bif v9.16b, v7.16b, v6.16b \n" /* choose*/ \ + "stp q8, q9, [%[out]] \n" /* save result */ + +#define SGEMV_OUT_1 \ + /* end */ \ + "4: \n" /* end */ \ "str s8, [%[out]] \n" /* save result */ #define SGEMV_OUT_1_RELU \ /* end */ \ "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ + "movi d1, #0 \n" /* zero data for relu */ \ + "fmax s8, s8, s1 \n" /* relu */ \ + "str s8, [%[out]] \n" /* save result */ + +#define SGEMV_OUT_1_RELU6 \ + /* end */ \ + "4: \n" /* end */ \ + "movi d1, #0 \n" /* zero data for relu6 */ \ + "fmov s2, %w[six] \n" /* mov six to s2 */ \ + "fmax s8, s8, s1 \n" /* relu6 */ \ + "fmin s8, s8, s2 \n" /* relu6 */ \ "str s8, [%[out]] \n" /* save result */ +#define SGEMV_OUT_1_LEAKEY_RELU \ + /* end */ \ + "4: \n" /* end */ \ + "fmov s1, %w[alpha] \n" /* mov alpha to s1 */ \ + "fcmp s8, #0 \n" /* cmp with zero*/ \ + "bge 5f \n" /* if ge zero */ \ + "fmul s8, s8, s1 \n" /* out * alpha */ \ + "5: \n" /* leakey relu label */ \ + "str s8, [%[out]] \n" /* save result */ + #else // __aarch64__ #define SGEMV_IN_4 \ @@ -841,14 +995,13 @@ bool sgemv(const float *A, "vmla.f32 q2, q5, q11 @ mul add\n" \ "vmla.f32 q3, q5, q13 @ mul add\n" \ "bne 1b @ jump to main loop\n" \ - /* pair add to final result */ \ "2: @ pair add \n" \ "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ "vpadd.f32 d10, d4, d5 @ pair add, first step\n" \ "vpadd.f32 d11, d6, d7 @ pair add, first step\n" \ "vpadd.f32 d0, d8, d9 @ pair add, second step\n" \ - "vpadd.f32 d1, d10, d11 @ pair add, second step\n" /* check tails */ \ + "vpadd.f32 d1, d10, d11 @ pair add, second step\n" \ "cmp %[tail], #1 @ check whether has tail\n" \ "blt 4f @ jump to end\n" \ "3: @ tail loop\n" \ @@ -876,7 +1029,7 @@ bool sgemv(const float *A, "bne 1b @ jump to main loop\n" \ "2: @ end processing\n" \ "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n"/*check tails*/ \ + "vpadd.f32 d0, d2, d2 @ pair add, final step\n" \ "cmp %[tail], #1 @ check whether has mid cols\n" \ "blt 4f @ jump to end\n" \ "3: @ tail 
loop\n" \ @@ -898,6 +1051,25 @@ bool sgemv(const float *A, "vmax.f32 q0, q0, q1 @ relu\n" \ "vst1.32 {d0-d1}, [%[out]] @ save result\n" +#define SGEMV_OUT_4_RELU6 \ + /* end */ \ + "4: @ end\n" \ + "vmov.i32 q1, #0 @ zero for relu6\n" \ + "vdup.f32 q2, %[six] @ six for relu6\n" \ + "vmax.f32 q0, q0, q1 @ relu6\n" \ + "vmin.f32 q0, q0, q2 @ relu6\n" \ + "vst1.32 {d0-d1}, [%[out]] @ save result\n" + +#define SGEMV_OUT_4_LEAKEY_RELU \ + /* end */ \ + "4: @ end\n" \ + "vmov.i32 q1, #0 @ zero for leakey relu\n" \ + "vdup.f32 q2, %[alpha] @ alpha for leakey relu\n" \ + "vcge.f32 q3, q0, q1 @ vcgeq_f32 \n" \ + "vmul.f32 q4, q0, q2 @ vmulq_f32 \n" \ + "vbif q0, q4, q3 @ choose \n" \ + "vst1.32 {d0-d1}, [%[out]] @ save result\n" + #define SGEMV_OUT_1 \ /* end */ \ "4: @ end\n" \ @@ -909,14 +1081,36 @@ bool sgemv(const float *A, "vmov.i32 d1, #0 @ zero for relu\n" \ "vmax.f32 d0, d0, d1 @ relu\n" \ "vst1.32 {d0[0]}, [%[out]] @ save result\n" + +#define SGEMV_OUT_1_RELU6 \ + /* end */ \ + "4: @ end\n" \ + "vmov.i32 d1, #0 @ zero for relu6\n" \ + "vdup.f32 d4, %[six] @ six for relu6\n" \ + "vmax.f32 d0, d0, d1 @ relu6\n" \ + "vmin.f32 d0, d0, d4 @ relu6\n" \ + "vst1.32 {d0[0]}, [%[out]] @ save result\n" + +#define SGEMV_OUT_1_LEAKEY_RELU \ + /* end */ \ + "4: @ end\n" \ + "vmov.i32 d2, #0 @ zero for leakey relu\n" \ + "vdup.f32 d3, %[alpha] @ alpha for leakey relu\n" \ + "vcge.f32 d6, d0, d2 @ vcgeq_f32 \n" \ + "vmul.f32 d8, d0, d3 @ vmulq_f32 \n" \ + "vbif d0, d8, d6 @ choose \n" \ + "vst1.32 {d0[0]}, [%[out]] @ save result\n" + #endif // clang-format on -void sgemv(const bool transA, - const int M, + +void sgemv(const int M, const int N, const float *A, const float *x, - float *y) { + float *y, + bool flag_bias, + const float *bias) { float *data_out = y; const float *data_in = x; const float *weights_ptr = A; @@ -926,7 +1120,6 @@ void sgemv(const bool transA, #ifdef __aarch64__ int out_cnt = M >> 3; - #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { int out_idx = j * 8; @@ -940,9 +1133,22 @@ void sgemv(const bool transA, const float *ptr_w5 = ptr_w4 + N; const float *ptr_w6 = ptr_w5 + N; const float *ptr_w7 = ptr_w6 + N; + const float *bias_ptr = bias + out_idx; + float bias_local[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + if (flag_bias) { + bias_local[0] = bias_ptr[0]; + bias_local[1] = bias_ptr[1]; + bias_local[2] = bias_ptr[2]; + bias_local[3] = bias_ptr[3]; + bias_local[4] = bias_ptr[4]; + bias_local[5] = bias_ptr[5]; + bias_local[6] = bias_ptr[6]; + bias_local[7] = bias_ptr[7]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8 + // clang-format off + asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -954,35 +1160,12 @@ void sgemv(const bool transA, [w7] "+r"(ptr_w7), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); + : [out] "r"(ptr_out), [bias_ptr] "r"(bias_local) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -992,24 +1175,17 @@ void sgemv(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - float tmp[4]; - float tmp1[4]; - float tmp2[4]; - float tmp3[4]; - float tmp4[4]; - asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [tmp] "r"(tmp), - [tmp1] "r"(tmp1), - [tmp2] "r"(tmp2), - [tmp3] "r"(tmp3), - [tmp4] "r"(tmp4) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } + asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 + : [in] "+r"(ptr_in), + [w0] "+r"(ptr_w0), + [cnt] "+r"(cnt_loop), + [tail] "+r"(tail_loop) + : [out] "r"(ptr_out), [bias0] "r"(bias0) + : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc"); } #else // __aarch64__ int out_cnt = M >> 2; @@ -1022,10 +1198,20 @@ void sgemv(const bool transA, const float *ptr_w1 = ptr_w0 + N; const float *ptr_w2 = ptr_w1 + N; const float *ptr_w3 = ptr_w2 + N; - + float bias0 = 0.f; + float bias1 = 0.f; + float bias2 = 0.f; + float bias3 = 0.f; + if (flag_bias) { + bias0 = bias[out_idx]; + bias1 = bias[out_idx + 1]; + bias2 = bias[out_idx + 2]; + bias3 = bias[out_idx + 3]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4 + // clang-format off + asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1033,23 +1219,16 @@ void sgemv(const bool transA, [w3] "+r"(ptr_w3), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", + : [out] "r"(ptr_out), + [bias0] "r"(bias0), + [bias1] "r"(bias1), + [bias2] "r"(bias2), + [bias3] "r"(bias3) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -1059,23 +1238,28 @@ void sgemv(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } + asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) + : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } #endif // __aarch64__ } -void sgemv_relu(const bool transA, - const int M, +void sgemv_relu(const int M, const int N, const float *A, const float *x, - float *y) { + float *y, + bool flag_bias, + const float *bias) { float *data_out = y; const float *data_in = x; const float *weights_ptr = A; @@ -1098,9 +1282,22 @@ void sgemv_relu(const bool transA, const float *ptr_w5 = ptr_w4 + N; const float *ptr_w6 = ptr_w5 + N; const float *ptr_w7 = ptr_w6 + N; + const float *bias_ptr = bias + out_idx; + float bias_local[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + if (flag_bias) { + bias_local[0] = bias_ptr[0]; + bias_local[1] = bias_ptr[1]; + bias_local[2] = bias_ptr[2]; + bias_local[3] = bias_ptr[3]; + bias_local[4] = bias_ptr[4]; + bias_local[5] = bias_ptr[5]; + bias_local[6] = bias_ptr[6]; + bias_local[7] = bias_ptr[7]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8_RELU + // clang-format off + asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1112,35 +1309,12 @@ void sgemv_relu(const bool transA, [w7] "+r"(ptr_w7), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); + : [out] "r"(ptr_out), [bias_ptr] "r"(bias_local) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -1150,13 +1324,17 @@ void sgemv_relu(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU + SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) + : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } #else // __aarch64__ @@ -1170,10 +1348,20 @@ void sgemv_relu(const bool transA, const float *ptr_w1 = ptr_w0 + N; const float *ptr_w2 = ptr_w1 + N; const float *ptr_w3 = ptr_w2 + N; - + float bias0 = 0.f; + float bias1 = 0.f; + float bias2 = 0.f; + float bias3 = 0.f; + if (flag_bias) { + bias0 = bias[out_idx]; + bias1 = bias[out_idx + 1]; + bias2 = bias[out_idx + 2]; + bias3 = bias[out_idx + 3]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4_RELU + // clang-format off + asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1181,23 +1369,16 @@ void sgemv_relu(const bool transA, [w3] "+r"(ptr_w3), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", + : [out] "r"(ptr_out), + [bias0] "r"(bias0), + [bias1] "r"(bias1), + [bias2] "r"(bias2), + [bias3] "r"(bias3) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "cc", "memory"); + // clang-format on } //! deal with remains #pragma omp parallel for @@ -1207,31 +1388,36 @@ void sgemv_relu(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } + asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) + : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } #endif // __aarch64__ } -void sgemv_bias(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { +void sgemv_relu6(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + const float six) { float *data_out = y; const float *data_in = x; const float *weights_ptr = A; int cnt = N >> 3; int tail = N & 7; - + float32x4_t vsix = vdupq_n_f32(six); #ifdef __aarch64__ int out_cnt = M >> 3; #pragma omp parallel for @@ -1248,9 +1434,21 @@ void sgemv_bias(const bool transA, const float *ptr_w6 = ptr_w5 + N; const float *ptr_w7 = ptr_w6 + N; const float *bias_ptr = bias + out_idx; + float bias_local[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + if (flag_bias) { + bias_local[0] = bias_ptr[0]; + bias_local[1] = bias_ptr[1]; + bias_local[2] = bias_ptr[2]; + bias_local[3] = bias_ptr[3]; + bias_local[4] = bias_ptr[4]; + bias_local[5] = bias_ptr[5]; + bias_local[6] = bias_ptr[6]; + bias_local[7] = bias_ptr[7]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8 + // clang-format off + asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 
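/* SGEMV_OUT_8_RELU6, concatenated next, clamps the eight accumulated dot
   products to [0, six]: fmax against a zeroed register, then fmin against
   %[vsix]. Scalar equivalent, as a hedged sketch:
     y[i] = y[i] < 0.f ? 0.f : (y[i] > six ? six : y[i]);
 */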
SGEMV_OUT_8_RELU6 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1262,35 +1460,13 @@ void sgemv_bias(const bool transA, [w7] "+r"(ptr_w7), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); + : [out] "r"(ptr_out), [bias_ptr] "r"(bias_local), + [vsix] "w" (vsix) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "cc", "memory"); + // clang-format on } //! deal with remains #pragma omp parallel for @@ -1300,14 +1476,17 @@ void sgemv_bias(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - float bias0 = bias[j]; + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 + SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU6 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) + : [out] "r"(ptr_out), [bias0] "r"(bias0), [six] "r"(six) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } #else // __aarch64__ @@ -1321,14 +1500,20 @@ void sgemv_bias(const bool transA, const float *ptr_w1 = ptr_w0 + N; const float *ptr_w2 = ptr_w1 + N; const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - + float bias0 = 0.f; + float bias1 = 0.f; + float bias2 = 0.f; + float bias3 = 0.f; + if (flag_bias) { + bias0 = bias[out_idx]; + bias1 = bias[out_idx + 1]; + bias2 = bias[out_idx + 2]; + bias3 = bias[out_idx + 3]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4 + // clang-format off + asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_RELU6 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1340,23 +1525,13 @@ void sgemv_bias(const bool transA, [bias0] "r"(bias0), [bias1] "r"(bias1), [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", + [bias3] "r"(bias3), + [six] "r" (six) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -1366,30 +1541,35 @@ void sgemv_bias(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } + asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU6 : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) + : [out] "r"(ptr_out), [bias0] "r"(bias0), [six] "r"(six) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } #endif // __aarch64__ } -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { +void sgemv_leakey_relu(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + const float alpha) { float *data_out = y; const float *data_in = x; const float *weights_ptr = A; int cnt = N >> 3; int tail = N & 7; + float32x4_t valpha = vdupq_n_f32(alpha); #ifdef __aarch64__ int out_cnt = M >> 3; #pragma omp parallel for @@ -1406,9 +1586,21 @@ void sgemv_bias_relu(const bool transA, const float *ptr_w6 = ptr_w5 + N; const float *ptr_w7 = ptr_w6 + N; const float *bias_ptr = bias + out_idx; + float bias_local[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + if (flag_bias) { + bias_local[0] = bias_ptr[0]; + bias_local[1] = bias_ptr[1]; + bias_local[2] = bias_ptr[2]; + bias_local[3] = bias_ptr[3]; + bias_local[4] = bias_ptr[4]; + bias_local[5] = bias_ptr[5]; + bias_local[6] = bias_ptr[6]; + bias_local[7] = bias_ptr[7]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8_RELU + // clang-format off + asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8_LEAKEY_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1420,35 +1612,13 @@ void sgemv_bias_relu(const bool transA, [w7] "+r"(ptr_w7), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); + : [out] "r"(ptr_out), [bias_ptr] "r"(bias_local), + [valpha] "w" (valpha) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -1458,14 +1628,17 @@ void sgemv_bias_relu(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - float bias0 = bias[j]; + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU + SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_LEAKEY_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop), [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) + : [out] "r"(ptr_out), [bias0] "r"(bias0), [alpha] "r"(alpha) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } #else // __aarch64__ @@ -1479,14 +1652,20 @@ void sgemv_bias_relu(const bool transA, const float *ptr_w1 = ptr_w0 + N; const float *ptr_w2 = ptr_w1 + N; const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - + float bias0 = 0.f; + float bias1 = 0.f; + float bias2 = 0.f; + float bias3 = 0.f; + if (flag_bias) { + bias0 = bias[out_idx]; + bias1 = bias[out_idx + 1]; + bias2 = bias[out_idx + 2]; + bias3 = bias[out_idx + 3]; + } int cnt_loop = cnt; int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_RELU + // clang-format off + asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_LEAKEY_RELU : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), @@ -1498,23 +1677,13 @@ void sgemv_bias_relu(const bool transA, [bias0] "r"(bias0), [bias1] "r"(bias1), [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", + [bias3] "r"(bias3), + [alpha] "r" (alpha) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "cc", "memory"); + // clang-format on } //! 
deal with remains #pragma omp parallel for @@ -1524,14 +1693,18 @@ void sgemv_bias_relu(const bool transA, const float *ptr_w0 = weights_ptr + (N * j); int cnt_loop = cnt; int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); + float bias0 = 0.f; + if (flag_bias) { + bias0 = bias[j]; + } + asm volatile( + SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_LEAKEY_RELU + : [in] "+r"(ptr_in), + [w0] "+r"(ptr_w0), + [cnt] "+r"(cnt_loop), + [tail] "+r"(tail_loop) + : [out] "r"(ptr_out), [bias0] "r"(bias0), [alpha] "r"(alpha) + : "q0", "q1", "q3", "q4", "q12", "q13", "q14", "q15", "cc", "memory"); } #endif // __aarch64__ } diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h index aa17349c99e61f7135090318be829149ecd6bb57..53b2c2ab55a2cee51f8535683c5cf34340fd6dab 100644 --- a/lite/backends/arm/math/sgemv.h +++ b/lite/backends/arm/math/sgemv.h @@ -17,23 +17,26 @@ #include #include "lite/core/context.h" #include "lite/core/device_info.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { -// TODO(xxx): fixme now only support transA = false -bool sgemv(const float* A, - const float* x, - float* y, +bool sgemv(const float *A, + const float *x, + float *y, bool transA, int M, int N, bool is_bias, - const float* bias, - bool is_relu, - const ARMContext* ctx); + const float *bias, + bool flag_act, + lite_api::ActivationType act, + const ARMContext *ctx, + float six = 6.f, + float alpha = 1.f); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/slice.cc b/lite/backends/arm/math/slice.cc index 8b9a7690509260ed4c6c0e14750d849f657d2fa8..67ca567fea988acfc9e20e2bfc929e9c3a0bbcb8 100644 --- a/lite/backends/arm/math/slice.cc +++ b/lite/backends/arm/math/slice.cc @@ -86,6 +86,13 @@ template void slice(const int* input, std::vector ends, int* out, Context* ctx); +template void slice(const float* input, + std::vector dims, + std::vector axes, + std::vector starts, + std::vector ends, + float* out, + Context* ctx); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/split.cc b/lite/backends/arm/math/split.cc index 54ea7e62c2567cf2fe490351572968366fda483e..bff29af93b525dc18e19bded03b0770f7f7a33c8 100644 --- a/lite/backends/arm/math/split.cc +++ b/lite/backends/arm/math/split.cc @@ -70,10 +70,12 @@ void split(const float* din, int in_after = in_strides[axis]; int out_after = out_strides[axis]; + const float* din_ptr = din + input_offset; + for (int i = 0; i < before; ++i) { - split_cpy(din + input_offset + i * in_after, - out_data + i * out_after, - out_after); + std::memcpy(out_data, din_ptr, sizeof(float) * out_after); + din_ptr += in_after; + out_data += out_after; } input_offset += out_strides[axis]; } diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.cc b/lite/backends/arm/math/split_merge_lod_tenosr.cc new file mode 100644 index 0000000000000000000000000000000000000000..35dc4a455b7c51e0aab1a45c48460ccc513b9a08 --- /dev/null +++ b/lite/backends/arm/math/split_merge_lod_tenosr.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/split_merge_lod_tenosr.h" +#include <utility> +#include <vector> + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>; +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, + size_t start_idx, + size_t end_idx, + size_t start_level) { + LoD sub_lod; + for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { + CHECK(start_idx <= end_idx); + CHECK(end_idx < lod[level_idx].size()); + std::vector<uint64_t> level_lens; + for (size_t i = start_idx; i < end_idx; ++i) { + level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); + } + sub_lod.emplace_back(level_lens); + start_idx = lod[level_idx][start_idx]; + end_idx = lod[level_idx][end_idx]; + } + return LoDAndOffset{sub_lod, {start_idx, end_idx}}; +} + +void AppendLoD(LoD *lod, const LoD &lod_length) { + CHECK(lod->empty() || lod->size() == lod_length.size()); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(std::vector<uint64_t>({0})); + } + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (auto len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.h b/lite/backends/arm/math/split_merge_lod_tenosr.h new file mode 100644 index 0000000000000000000000000000000000000000..47c484aa4a203ed1819a7e810f71858f4ef0b4dd --- /dev/null +++ b/lite/backends/arm/math/split_merge_lod_tenosr.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
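For readers tracing the LoD bookkeeping above: a minimal standalone sketch of what GetSubLoDAndAbsoluteOffset computes for a one-level LoD. The nested-vector LoD alias is spelled out here only for the example; in the tree it comes from lite/core/tensor.h.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using LoD = std::vector<std::vector<uint64_t>>;  // assumption for this sketch

int main() {
  // One level, three sequences covering element ranges [0,2), [2,5), [5,9).
  LoD lod = {{0, 2, 5, 9}};
  std::size_t start_idx = 1, end_idx = 3;  // take the 2nd and 3rd sequences
  // Per-sequence lengths, exactly as the level_lens loop above computes them.
  std::vector<uint64_t> lens;
  for (std::size_t i = start_idx; i < end_idx; ++i) {
    lens.push_back(lod[0][i + 1] - lod[0][i]);  // {3, 4}
  }
  // After the level walk, start/end become absolute element offsets [2, 9).
  std::cout << lens[0] << "," << lens[1] << " -> [" << lod[0][start_idx] << ","
            << lod[0][end_idx] << ")\n";
}

With this input the function returns the sub-LoD {{3, 4}} plus the offset pair {2, 9}: the lengths of the selected sequences and the absolute element range they occupy. AppendLoD is the inverse direction, turning such length lists back into cumulative offsets.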
+ +#pragma once + +#include <utility> +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset( + const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level); + +void AppendLoD(LoD *lod, const LoD &lod_length); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc index 6ded50e75294ad5145b3b88c4c341d4cce09c812..c50abb741ded487efa03d7d46baf2c6f13a8791d 100644 --- a/lite/backends/arm/math/type_trans.cc +++ b/lite/backends/arm/math/type_trans.cc @@ -46,6 +46,7 @@ void fp32_to_int8(const float* din, float inv_scale = 1.f / scale[j % axis_size]; float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vmax = vdupq_n_f32(-127.f); float32x4_t vpoff = vdupq_n_f32(0.5f); float32x4_t vnoff = vdupq_n_f32(-0.5f); const float* din_c = din + j * inner_size; @@ -63,6 +64,14 @@ void fp32_to_int8(const float* din, "fmul v5.4s, v1.4s, %[scale].4s \n" "fmul v6.4s, v2.4s, %[scale].4s \n" "fmul v7.4s, v3.4s, %[scale].4s \n" + "fcmge v8.4s, v4.4s, %[vmax].4s \n" + "fcmge v9.4s, v5.4s, %[vmax].4s \n" + "fcmge v10.4s, v6.4s, %[vmax].4s \n" + "fcmge v11.4s, v7.4s, %[vmax].4s \n" + "bif v4.16b, %[vmax].16b, v8.16b \n" + "bif v5.16b, %[vmax].16b, v9.16b \n" + "bif v6.16b, %[vmax].16b, v10.16b \n" + "bif v7.16b, %[vmax].16b, v11.16b \n" "ldp q0, q1, [%[in]], #32 \n" "subs %[cnt], %[cnt], #1 \n" "FCVTAS v8.4s, v4.4s \n" @@ -79,7 +88,7 @@ void fp32_to_int8(const float* din, "str q8, [%[out]], #16 \n" "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) + : [scale] "w"(vscale), [vmax] "w"(vmax) : "v0", "v1", "v2", @@ -104,15 +113,23 @@ void fp32_to_int8(const float* din, "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q8 @ get right offset\n" "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" + "vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n" + "vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n" + "vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n" + "vbif q4, %q[vmax], q8 @ choose \n" + "vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n" + "vbif q5, %q[vmax], q9 @ choose \n" + "vbif q6, %q[vmax], q10 @ choose \n" + "vbif q7, %q[vmax], q8 @ choose \n" "vcvt.s32.f32 q0, q4 @ cvt to int32\n" "vcvt.s32.f32 q1, q5 @ cvt to int32\n" "vcvt.s32.f32 q2, q6 @ cvt to int32\n" @@ -133,25 +150,16 @@ void fp32_to_int8(const float* din, : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); + [vzero] "w"(vzero), + [vmax] "w"(vmax) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"); #endif } const float* din_r = din_c + 16 * cnt; signed char* dout_r = dout_c + 16 * cnt; for (int i = 0; i < remain; ++i) { dout_r[i] = saturate_cast<signed char>(roundf(inv_scale * din_r[i])); +
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i]; } } } diff --git a/lite/backends/bm/CMakeLists.txt b/lite/backends/bm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e15b9836b875cec8b5e129ad0f6aceb85ff9d33 --- /dev/null +++ b/lite/backends/bm/CMakeLists.txt @@ -0,0 +1,5 @@ +if (NOT LITE_WITH_BM) + return() +endif() + +lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs}) diff --git a/lite/backends/bm/target_wrapper.cc b/lite/backends/bm/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..c75c71452269167064c248418098bcb285d09055 --- /dev/null +++ b/lite/backends/bm/target_wrapper.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/backends/bm/target_wrapper.h" +#include <bmlib_runtime.h> +#include <cstdlib> +#include <utility> + +namespace paddle { +namespace lite { + +int TargetWrapperBM::device_id_ = 0; +std::map<int, void*> TargetWrapperBM::bm_hds_; + +size_t TargetWrapperBM::num_devices() { + int count = 0; + bm_dev_getcount(&count); + return count; +} + +void TargetWrapperBM::SetDevice(int id) { + /* + if (id < 0 || (size_t)id >= num_devices()) { + LOG(FATAL) << "Failed with invalid device id " << id; + } + */ + device_id_ = id; + if (bm_hds_.find(id) == bm_hds_.end()) { + bm_handle_t bm_handle; + bm_status_t ret = bm_dev_request(&bm_handle, id); + CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: " + << static_cast<int>(ret); + bm_hds_.insert(std::pair<int, void*>(id, bm_handle)); + } + return; +} + +void* TargetWrapperBM::GetHandle() { + if (bm_hds_.find(device_id_) == bm_hds_.end()) { + LOG(FATAL) << "device not initialized " << device_id_; + } + return bm_hds_.at(device_id_); +} + +void* TargetWrapperBM::Malloc(size_t size) { + void* ptr{}; + + if (bm_hds_.find(device_id_) == bm_hds_.end()) { + SetDevice(device_id_); + } + + bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_)); + bm_device_mem_t* p_mem = + reinterpret_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t))); + bm_malloc_device_byte(bm_handle, p_mem, size); + ptr = reinterpret_cast<void*>(p_mem); + return ptr; +} + +void TargetWrapperBM::Free(void* ptr) { + if (ptr != NULL) { + bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_)); + bm_device_mem_t* mem = static_cast<bm_device_mem_t*>(ptr); + bm_free_device(bm_handle, *mem); + free(ptr); + } + return; +} + +void TargetWrapperBM::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + if (bm_hds_.find(device_id_) == bm_hds_.end()) { + return; + } + + bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_)); + bm_device_mem_t* pmem{}; + const bm_device_mem_t* pcst_mem{}; + + switch (dir) { + case IoDirection::HtoD: + pmem = static_cast<bm_device_mem_t*>(dst); + bm_memcpy_s2d_partial_offset( + bm_handle, *pmem, const_cast<void*>(src), size, 0); + break; + case IoDirection::DtoH: + pcst_mem = static_cast<const bm_device_mem_t*>(src); + bm_memcpy_d2s_partial_offset( + bm_handle, reinterpret_cast<void*>(dst), *pcst_mem, size, 0); + break; + default: + LOG(FATAL) << "Unsupported
IoDirection " << static_cast(dir); + break; + } + return; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/bm/target_wrapper.h b/lite/backends/bm/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2674ffe161582fbd2fe0dfcabbe8e349d13f847f --- /dev/null +++ b/lite/backends/bm/target_wrapper.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperBM = TargetWrapper; + +template <> +class TargetWrapper { + public: + using stream_t = int; + using event_t = int; + + static size_t num_devices(); + static size_t maximum_stream() { return 0; } + + static void SetDevice(int id); + static void CreateStream(stream_t* stream) {} + static void DestroyStream(const stream_t& stream) {} + + static void CreateEvent(event_t* event) {} + static void DestroyEvent(const event_t& event) {} + + static void RecordEvent(const event_t& event) {} + static void SyncEvent(const event_t& event) {} + + static void StreamSync(const stream_t& stream) {} + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void* GetHandle(); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); + + static void MemcpyAsync(void* dst, + const void* src, + size_t size, + IoDirection dir, + const stream_t& stream) {} + + static void MemsetSync(void* devPtr, int value, size_t count) {} + + static void MemsetAsync(void* devPtr, + int value, + size_t count, + const stream_t& stream) {} + + private: + static int device_id_; + static std::map bm_hds_; +}; +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index a6c3fcc66a789f159cd3a756ed893627b393e1fe..35f5f0ce2d93db59cbb856d8008e6f3138633e42 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -1,10 +1,9 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) -nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps}) -nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps}) +nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) +nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) add_subdirectory(math) diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 1829bcf330aba31708ac97c97d093afbda197908..fafd74ae7a43d1a769456edfe408c71593d21201 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,8 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY 
CUDA_STATIC_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) @@ -12,6 +11,7 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) +nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) @@ -23,6 +23,7 @@ set ( cuda_type_trans cuda_transpose cuda_elementwise + cudnn_pool cuda_gemm cuda_batched_gemm ) diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 72ed3951f6b9b22a5ae1ee6caef8c69708102885..5dd53084f4079ae68c6fda0530fb5de8cf1d3717 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -31,6 +31,9 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, auto o_dims = param.output->dims(); int batch = x_dims[0]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int iw = x_dims[3]; // nchw int ih = x_dims[2]; int ic = x_dims[1]; @@ -41,10 +44,10 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, int kh = w_dims[2]; int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; CHECK(ic % param.groups == 0) << "The conv input channel should be divisible by the group number."; @@ -86,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); } +#if CUDNN_VERSION_MIN(7, 0, 0) + cudnnMathType_t math_type = + use_tensor_core_ ?
CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH; + CUDNN_CHECK(cudnnSetConvolutionMathType(this->conv_desc_, math_type)); +#endif + if (ic == param.groups && ic == oc && ic != 1) { this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } else if (1) { + } else if (!param.var_length) { const auto* i_data = param.x->data<float>(); const auto* w_data = param.filter->data<float>(); auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA)); @@ -133,8 +142,8 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, this->fwd_algo_ = algo_cache.GetAlgorithm(x_dims.Vectorize(), w_dims.Vectorize(), param.strides, - param.paddings, - param.dilations, + *param.paddings, + *param.dilations, 0, search_func); @@ -311,12 +320,15 @@ bool CudnnConv2DInt8<Ptype_out>::create(const operators::ConvParam& param, int kw = w_dims[2]; int kh = w_dims[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; std::vector<float> weight_scale = param.weight_scale; float input_scale = param.input_scale; diff --git a/lite/backends/cuda/math/cudnn_pool.cc b/lite/backends/cuda/math/cudnn_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..f970fc326b29c4c226e7dc9643e416a3cf24f0eb --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
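A note on the index changes above (pw moving from paddings[1] to paddings[2]): ConvParam now hands out paddings and dilations behind shared pointers, and the padding vector carries four entries instead of two. A hedged sketch of the assumed layout, inferred from ph staying at index 0 while pw moves to index 2:

#include <vector>

int main() {
  // Assumed four-entry layout: {pad_top, pad_bottom, pad_left, pad_right}.
  std::vector<int> paddings = {1, 1, 2, 2};
  int ph = paddings[0];  // vertical padding, as cudnn_conv.cc now reads it
  int pw = paddings[2];  // horizontal padding; index 1 would be pad_bottom
  (void)ph;
  (void)pw;
}

Under that layout the old paddings[1] would silently read the bottom pad rather than the left one, which is why both the float and int8 conv paths switch pw to index 2. The pooling code below normalizes its paddings to the same four-entry form.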
+ +#include "lite/backends/cuda/math/cudnn_pool.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/backends/cuda/math/scale.h" +#include "lite/backends/cuda/math/type_trans.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::vector& data_dims, + const std::vector& strides, + const std::vector& ksize) { + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + CHECK(data_dims.size() * 2 == paddings->size()) + << "Paddings size should be the same or twice as the pooling size."; + } + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const std::vector& data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} + +template <> +bool CudnnPool2DNHWC::create( + const operators::PoolParam& param, Context* ctx) { + return true; +} + +template <> +bool CudnnPool2DNHWC::init(const operators::PoolParam& param, + Context* ctx) { + this->stream_ = ctx->exec_stream(); + CUDNN_CHECK(cudnnCreate(&this->handle_)); + CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); + + cudnnCreateTensorDescriptor(&this->input_desc_); + cudnnCreateTensorDescriptor(&this->output_desc_); + cudnnCreatePoolingDescriptor(&this->pooling_desc_); + + return create(param, ctx); +} + +template <> +bool CudnnPool2DNHWC::run( + const operators::PoolParam& param) { + auto x_dims = param.x->dims(); + auto o_dims = param.output->dims(); + int batch = x_dims[0]; + const float* in_data = param.x->data(); + float* out_data = param.output->mutable_data(TARGET(kCUDA)); + + int ih = x_dims[1]; + int iw = x_dims[2]; // nchw + int ic = x_dims[3]; + + int oh = o_dims[1]; + int ow = o_dims[2]; + int oc = o_dims[3]; + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = *(param.paddings.get()); + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + + std::vector data_dims = {ih, iw}; + UpdatePadding(&paddings, global_pooling, adaptive, data_dims, strides, ksize); + + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + ic, + ih, + iw)); + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + oc, + oh, + ow)); + cudnnPoolingMode_t mode; + if (pooling_type == "max") { + mode = CUDNN_POOLING_MAX; + } else { + mode = exclusive ? 
CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CUDNN_CHECK(cudnnSetPoolingNdDescriptor(this->pooling_desc_, + mode, + CUDNN_NOT_PROPAGATE_NAN, + ksize.size(), + ksize.data(), + paddings.data(), + strides.data())); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnPoolingForward(this->handle_, + this->pooling_desc_, + &alpha, + this->input_desc_, + in_data, + &beta, + this->output_desc_, + out_data)); + + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_pool.h b/lite/backends/cuda/math/cudnn_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..acdc695b500ab41d615cb98c9501efd729c2fe6a --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <cudnn.h> +#include <string> +#include <vector> +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <lite_api::PrecisionType Ptype_out> +class CudnnPool2DBase { + public: + CudnnPool2DBase() + : handle_(NULL), + input_desc_(NULL), + output_desc_(NULL), + pooling_desc_(NULL) {} + + ~CudnnPool2DBase() { + if (handle_ != NULL) { + CUDNN_CHECK(cudnnDestroy(handle_)); + } + if (input_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + } + if (output_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_)); + } + if (pooling_desc_) { + cudnnDestroyPoolingDescriptor(pooling_desc_); + } + } + + protected: + cudaStream_t stream_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t input_desc_; + cudnnTensorDescriptor_t output_desc_; + cudnnPoolingDescriptor_t pooling_desc_; +}; + +template <lite_api::PrecisionType Ptype_out> +class CudnnPool2DNHWC : public CudnnPool2DBase<Ptype_out> { + public: + CudnnPool2DNHWC() : CudnnPool2DBase<Ptype_out>() {} + virtual ~CudnnPool2DNHWC() = default; + virtual bool init(const operators::PoolParam& param, + Context<TARGET(kCUDA)>* ctx); + + virtual bool create(const operators::PoolParam& param, + Context<TARGET(kCUDA)>* ctx); + + virtual bool run(const operators::PoolParam& param); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 57c9ec022a6e49551fd2d56a9b2036de13bf5a2c..8f0ebd1f97a03f03b568de694b986e9540f07c55 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,13 +13,55 @@ // limitations under the License.
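Before reading the new elementwise kernels below: they index the broadcast operand with idx = tid / post % n, i.e. the flat output is viewed as a (pre, n, post) box in which y varies only along the middle axis. A small host-side sketch of that decomposition, with hypothetical sizes:

#include <cstdio>

int main() {
  // Output viewed as a (pre, n, post) box; y is broadcast over pre and post.
  const int pre = 2, n = 3, post = 4;
  for (int tid = 0; tid < pre * n * post; ++tid) {
    int idx = tid / post % n;  // same mapping the CUDA kernels use per thread
    std::printf("out[%d] pairs with y[%d]\n", tid, idx);
  }
}

For example, tid = 17 gives 17 / 4 = 4 and 4 % 3 = 1, so output element 17 is combined with y[1]; pre itself never appears in the formula because y repeats identically for every outer slice.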
#include "lite/backends/cuda/math/elementwise.h" -#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +__global__ void elementwise_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; +#if __CUDA_ARCH__ >= 350 + out_data[tid] = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); +#else + out_data[tid] = binary_calc(x_data[tid], y_data[idx], type); +#endif + } +} + +template +__global__ void elementwise_relu_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : 0; + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -76,6 +118,56 @@ __global__ void elementwise_add_nhwc4_int8_kernel(const size_t total, } } +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template void elementwise(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + +template void elementwise_relu(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index 7fcdf95021ff21379bf94298ed06328dd6d2db09..ce45d0544e5a55a9cdc34bdfacc2b48157f5a198 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,12 +15,33 @@ #pragma once #include #include +#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h index 12194d54b08a533a3812e10b5d2f78134c19da24..85576e65018a0e1bdec6f2bd2fdc590bd35e9656 100644 --- a/lite/backends/cuda/math/gemm.h +++ b/lite/backends/cuda/math/gemm.h @@ -55,6 +55,8 @@ class Gemm { PtypeOut* c, Context* ctx); + cublasHandle_t 
get_handle() const { return cu_handle_; } + private: cudaStream_t exe_stream_; cublasHandle_t cu_handle_; diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu index cebcece812dc584d0921edea2fef8f129e430b56..c50840fe269657965db8c58b171fce6819009775 100644 --- a/lite/backends/cuda/math/transpose.cu +++ b/lite/backends/cuda/math/transpose.cu @@ -69,44 +69,16 @@ void BatchTranspose2DCUDAImpl(const int N, const int H, const int W, const T* input, T* out, - CUDAContext* ctx) { + cudaStream_t* stream) { const int dh = (H + kTileDim - 1) / kTileDim; const int dw = (W + kTileDim - 1) / kTileDim; BatchTranspose2DCUDAKernel< - T><<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, ctx->exec_stream()>>>( + T><<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, *stream>>>( N, H, W, dh, dw, input, out); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } -#define TYPE_SPECIALIZED_CUDA_NCHW2NHWC(T) \ - template <> \ - void NCHW2NHWC<T>(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, C, HxW, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NCHW2NHWC(float) -TYPE_SPECIALIZED_CUDA_NCHW2NHWC(int8_t) -#undef TYPE_SPECIALIZED_CUDA_NCHW2NHWC - -#define TYPE_SPECIALIZED_CUDA_NHWC2NCHW(T) \ - template <> \ - void NHWC2NCHW<T>(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, HxW, C, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NHWC2NCHW(float) -TYPE_SPECIALIZED_CUDA_NHWC2NCHW(int8_t) -#undef TYPE_SPECIALIZED_CUDA_NHWC2NCHW - template <typename T> __global__ void TransposeCUDAKernel(const int size, const int ndim, @@ -136,7 +108,9 @@ void TransposeCUDAImpl(const std::vector<int64_t>& X_dims, const std::vector<int>& axes, const T* X, T* Y, - CUDAContext* ctx) { + lite::Tensor* Y_dims_, + lite::Tensor* strides_, + cudaStream_t* stream) { CHECK_EQ(X_dims.size(), axes.size()) << "dimension size should be equal"; int ndim = X_dims.size(); std::vector<int> strides(ndim, 0); @@ -156,37 +130,68 @@ void TransposeCUDAImpl(const std::vector<int64_t>& X_dims, size *= X_dims[i]; } - lite::Tensor Y_dims_, strides_; - Y_dims_.Resize(std::vector<int64_t>({ndim})); - int* d_y_dims = Y_dims_.mutable_data<int>(TARGET(kCUDA)); - CopySync<TARGET(kCUDA)>( - d_y_dims, Y_dims.data(), sizeof(int) * Y_dims.size(), IoDirection::HtoD); + Y_dims_->Resize(std::vector<int64_t>({ndim})); + int* d_y_dims = Y_dims_->mutable_data<int>(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(d_y_dims, + Y_dims.data(), + sizeof(int) * Y_dims.size(), + IoDirection::HtoD, + *stream); - strides_.Resize(std::vector<int64_t>({ndim})); - int* d_strides = strides_.mutable_data<int>(TARGET(kCUDA)); - CopySync<TARGET(kCUDA)>(d_strides, - strides.data(), - sizeof(int) * strides.size(), - IoDirection::HtoD); + strides_->Resize(std::vector<int64_t>({ndim})); + int* d_strides = strides_->mutable_data<int>(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(d_strides, + strides.data(), + sizeof(int) * strides.size(), + IoDirection::HtoD, + *stream); const int M = (size + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; - TransposeCUDAKernel<<<M, CUDA_NUM_THREADS, 0, ctx->exec_stream()>>>( + TransposeCUDAKernel<<<M, CUDA_NUM_THREADS, 0, *stream>>>( size, ndim, d_strides, d_y_dims, X, Y); auto e = cudaGetLastError(); CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); } -#define TYPE_SPECIALIZED_CUDA_TRANSPOSE(T) \ - template <> \ - void Transpose<T>(const std::vector<int64_t>& X_dims, \ - const std::vector<int>& axes, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - TransposeCUDAImpl(X_dims, axes, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_TRANSPOSE(float) -#undef TYPE_SPECIALIZED_CUDA_TRANSPOSEF +template <typename T> +void Transpose<T>::NCHW2NHWC( + int N, int C, int HxW,
const T* X, T* Y, cudaStream_t* stream) { + BatchTranspose2DCUDAImpl(N, C, HxW, X, Y, stream); +} + +template <typename T> +void Transpose<T>::NHWC2NCHW( + int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream) { + BatchTranspose2DCUDAImpl(N, HxW, C, X, Y, stream); +} + +template <typename T> +void Transpose<T>::transpose(T* dst, + const T* src, + const std::vector<int64_t>& src_dims, + const std::vector<int>& axes, + cudaStream_t* stream) { + TransposeCUDAImpl(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); +} + +// template <typename T> +// void Transpose<T>::transpose(T* dst, +// const T* src, +// const std::vector<int>& src_dims, +// const std::vector<int>& axes, +// cudaStream_t* stream) { +// std::vector<int64_t> _src_dims(src_dims.size(), 0); +// std::transform( +// src_dims.begin(), +// src_dims.end(), +// _src_dims.begin(), +// [](int data) -> int64_t { return static_cast<int64_t>(data); }); +// TransposeCUDAImpl(_src_dims, axes, src, dst, &Y_dims_, &strides_, +// stream); +//} + +template class Transpose<float>; +template class Transpose<int8_t>; } // namespace math } // namespace cuda diff --git a/lite/backends/cuda/math/transpose.h b/lite/backends/cuda/math/transpose.h index ba2464547b587f44cd9b0ce287a0d40d37d46411..ed52ba3b5590ab631c3c57a0472e16cb0ed51a91 100644 --- a/lite/backends/cuda/math/transpose.h +++ b/lite/backends/cuda/math/transpose.h @@ -26,17 +26,27 @@ namespace cuda { namespace math { template <typename T> -void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); +class Transpose { + public: + void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream); -template <typename T> -void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); + void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream); -template <typename T> -void Transpose(const std::vector<int64_t>& X_dims, - const std::vector<int>& axes, - const T* X, - T* Y, - CUDAContext* ctx); + void transpose(T* dst, + const T* src, + const std::vector<int64_t>& src_dims, + const std::vector<int>& axes, + cudaStream_t* stream); + + // void transpose(T* dst, + // const T* src, + // const std::vector<int>& src_dims, + // const std::vector<int>& axes, + // cudaStream_t* stream); + + private: + lite::Tensor Y_dims_, strides_; // for transpose.
+}; } // namespace math } // namespace cuda diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b4cd82fd8df6df063d92df709311f3c90e7cf4b6..b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -25,6 +25,24 @@ namespace lite { namespace cuda { namespace math { +enum class BinaryOperation { + kADD = 0, + kMUL = 1, + kDIV = 2, +}; + +template <typename T> +__device__ T binary_calc(T x, T y, BinaryOperation type); + +template <> +__device__ __forceinline__ float binary_calc(float x, + float y, + BinaryOperation type) { + if (type == BinaryOperation::kADD) return x + y; + if (type == BinaryOperation::kMUL) return x * y; + if (type == BinaryOperation::kDIV) return x / y; +} + template <typename T> __device__ T from_float(float x); diff --git a/lite/backends/fpga/CMakeLists.txt b/lite/backends/fpga/CMakeLists.txt index b12fd85caf7e0c79de830b45569e02ba916c34e6..a5207c01a4d5e7b8d05490bd7c9be0dcc01f365e 100644 --- a/lite/backends/fpga/CMakeLists.txt +++ b/lite/backends/fpga/CMakeLists.txt @@ -3,13 +3,35 @@ endif() set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD") +set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi") +set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes") set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga") message("fpga_kd_path ${LITE_FPGA_KD_PATH}") message("fpga_path ${LITE_FPGA_PATH}") -file(GLOB_RECURSE KD_CPP *.cpp *.cc) +file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp") +file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp") +file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp") file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc") - -cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) +set(FPGA_ALL_CPP "") +FOREACH(FILE_PATH ${KD_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${PE_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${LLAPI_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${FPGA_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list( APPEND FPGA_ALL_CPP ${FILE_NAME}) +ENDFOREACH(FILE_PATH) +message("fpga kd: ${FPGA_ALL_CPP}") +cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP}) +#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory) -cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga) +cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga) diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp new file mode 100755 index 0000000000000000000000000000000000000000..9b1189c407d6d601bb3e5ba8172b1455f04710fd --- /dev/null +++ b/lite/backends/fpga/KD/debugger.hpp @@ -0,0 +1,152 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <unordered_map> + +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { + +#define FPGA_PRINT_TENSOR + +class Debugger { + public: + static Debugger& get_instance() { + static Debugger s_instance; + return s_instance; + } + + void registerOutput(std::string op_type, zynqmp::Tensor* tensor) { + if (op_config[op_type]) { + tensor->saveToFile(op_type, true); + } + } + + private: + std::unordered_map<std::string, bool> op_config; + Debugger() { + op_config["concat"] = true; + op_config["pooling"] = true; + op_config["conv"] = true; + op_config["dwconv"] = true; + op_config["ew_add"] = true; + op_config["crop"] = true; + op_config["feed"] = true; + op_config["mul"] = true; + op_config["fetch"] = true; + op_config["boxes"] = true; + op_config["scores"] = true; + op_config["nms"] = true; + op_config["pb_boxes"] = true; + op_config["pb_variances"] = true; + // op_config["fc"] = true; + op_config["softmax"] = true; + } +}; + +inline void chw_to_hwc(Tensor* t, float* dst) { + int num = t->dims()[0]; + int channel = t->dims()[1]; + + int height = 1; + int width = 1; + if (t->dims().size() > 2) { + height = t->dims()[2]; + } + if (t->dims().size() > 3) { + width = t->dims()[3]; + } + const float* chw_data = t->data<float>(); + float* hwc_data = dst; + + int chw = channel * height * width; + int wc = width * channel; + int index = 0; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index]; + index++; + } + } + } + } +} + +inline void read_from_file(lite::Tensor* t, const std::string& path) { + std::ifstream file_stream; + file_stream.open(path); + if (!file_stream) { + return; + } + float* data = t->mutable_data<float>(); + int num = t->numel(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + data[i] = value; + } +} + +inline void save_float(float* data, const std::string& name, int len) { + static int counter = 0; + std::string old_string = std::to_string(counter); + std::string new_string = + std::string(3 - old_string.length(), '0') + old_string; + + std::string file = "arm_" + new_string + name; + counter++; + + std::ofstream ofs; + ofs.open(file); + for (int i = 0; i < len; i++) { + float value = data[i]; + ofs << value << std::endl; + } + ofs.close(); +} + +inline void save_tensor(lite::Tensor* t, + const std::string& name, + bool convert = true) { + float* data = const_cast<float*>(t->data<float>()); + float* dst = new float[t->numel()]; + if (convert) { + chw_to_hwc(t, dst); + data = dst; + } + + save_float(data, name, t->numel()); + delete[] dst; +} + +inline void save_tensor(const lite::Tensor* t, + const std::string& name, + bool convert = true) { + float* data = const_cast<float*>(t->data<float>()); + float* dst = new float[t->numel()]; + if (convert) { + chw_to_hwc(const_cast<lite::Tensor*>(t), dst); + data = dst; + } + save_float(data, name, t->numel()); + delete[] dst; +} } // namespace lite +} // namespace paddle diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp old mode 100644 new mode
100755 index 9849e4275b5d0f59346b9684530610853f1a560c..ea503518a0f39671e77157f14788a1cadb4579f3 --- a/lite/backends/fpga/KD/dl_engine.cpp +++ b/lite/backends/fpga/KD/dl_engine.cpp @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lite/backends/fpga/KD/dl_engine.hpp" + namespace paddle { namespace zynqmp { DLEngine::DLEngine() { open_device(); - struct DeviceInfo info; - int ret = get_device_info(info); - filter::set_filter_capacity(info.filter_cap); + int ret = get_device_info(info_); + filter::set_filter_capacity(info_.filter_cap); + filter::set_colunm(info_.colunm); } } // namespace zynqmp diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp old mode 100644 new mode 100755 index 829f41dfebfabfe5642bd4cf107fc6c54f3ffd86..eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6 --- a/lite/backends/fpga/KD/dl_engine.hpp +++ b/lite/backends/fpga/KD/dl_engine.hpp @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include - #include "lite/backends/fpga/KD/llapi/filter.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -29,8 +28,15 @@ class DLEngine { return s_instance; } + DeviceInfo& deviceInfo(); + + bool isZU3() { return info_.device_type / 100 == 3; } + + float* out_data = nullptr; + private: DLEngine(); + DeviceInfo info_; }; } // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/layout.hpp b/lite/backends/fpga/KD/layout.hpp index 74819cd2120630def0114422b04efe076e1d6cb2..c6b5c911872b6b22633a4319ea708ed23c7e7e36 100644 --- a/lite/backends/fpga/KD/layout.hpp +++ b/lite/backends/fpga/KD/layout.hpp @@ -22,6 +22,7 @@ namespace paddle { namespace zynqmp { enum LayoutType { + None, N, NC, NCHW, @@ -39,6 +40,15 @@ class Layout { virtual int elementCount(const std::vector<int>& dims) = 0; }; +struct None : Layout { + int numIndex() { return -1; } + int channelIndex() { return -1; } + int heightIndex() { return -1; } + int widthIndex() { return -1; } + int alignedElementCount(const std::vector<int>& dims) { return 16; } + virtual int elementCount(const std::vector<int>& dims) { return 1; } +}; + struct NCHW : Layout { int numIndex() { return 0; } int channelIndex() { return 1; } diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp index cd60f27f9896e857f8ad566d285a9b9aea1d4721..339a442207e811be31161ff25f60a080572efe8d 100644 --- a/lite/backends/fpga/KD/llapi/bias_scale.cpp +++ b/lite/backends/fpga/KD/llapi/bias_scale.cpp @@ -14,6 +14,7 @@ limitations under the License.
*/ #include +#include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/bias_scale.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { *data_in = ptr_aligned; } -void interleave(float **data_in, int num_after_alignment) { +size_t interleave(float **data_in, int num_after_alignment) { float *ptr_uninterleaved = *data_in; float *ptr_interleaved = (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT @@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) { fpga_free(ptr_uninterleaved); *data_in = ptr_interleaved; + return 2 * num_after_alignment * sizeof(float); } void format_bias_scale_array(float **bias_scale_array, @@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array, int div_num = (num + element_num_per_division - 1) / element_num_per_division; int element_num_after_division = align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); + size_t mem = + interleave(bias_scale_array, div_num * element_num_after_division); + fpga_flush(*bias_scale_array, mem); } void format_bias_array(float **bias_array, int num) { float *ptr_unaligned = *bias_array; diff --git a/lite/backends/fpga/KD/llapi/bias_scale.h b/lite/backends/fpga/KD/llapi/bias_scale.h index 83f30df18fc7e5967d727ed8ce275d63e1cb29e0..d47d082ccdc6b41cf43860495e43076c17b13ac3 100644 --- a/lite/backends/fpga/KD/llapi/bias_scale.h +++ b/lite/backends/fpga/KD/llapi/bias_scale.h @@ -19,7 +19,7 @@ namespace zynqmp { namespace bias_scale { void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); +size_t interleave(float** data_in, int num_after_alignment); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp old mode 100644 new mode 100755 index 0e41a204a854b0b57e1a8c98fb3cc8d5224c807c..da81565cf5ca152a54b6cc1514cb660589428439 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "lite/backends/fpga/KD/llapi/filter.h" #include #include +#include +#include #include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -23,11 +25,41 @@ namespace zynqmp { namespace filter { static int FILTER_SIZE = 2048; +static int COLUMN = 4; + +void saveToFile(std::string name, void* data_in, int size) { + std::ofstream ofs; + ofs.open(name); + + int8_t* data = static_cast(data_in); + for (int i = 0; i < size; i++) { + float value = data[i]; + ofs << value << std::endl; + } + ofs.close(); +} + +void saveFloatToFile(std::string name, float* data_in, int size) { + std::ofstream ofs; + ofs.open(name); + + for (int i = 0; i < size; i++) { + float value = data_in[i]; + ofs << value << std::endl; + } + ofs.close(); +} void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; } +void set_colunm(uint32_t column) { COLUMN = column; } + +// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT +int get_filter_num_alignment() { return COLUMN * 4; } + int calc_division_capacity(int chw) { - int n = FILTER_SIZE / ((chw + 15) / 16) * 32; + int filter_num_alignment = get_filter_num_alignment(); + int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment; return n < FILTER_SIZE ? n : FILTER_SIZE; } @@ -52,28 +84,36 @@ int calc_num_per_div(int num, int group_num, int division_capacity) { } } -void convert_to_hwc( - char **data_in, int num, int channel, int height, int width) { - char *tmp = *data_in; +int calc_pack_num(int num_per_group, int group, int division_capacity) { + auto n = 1; + while ((num_per_group * (group + n - 1) / n) > division_capacity) { + n++; + } + return (n); +} + +void convert_to_hwc(int8_t* chw_data, + int8_t* hwc_data, + int num, + int channel, + int height, + int width) { int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT + int wc = width * channel; + int index = 0; for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); + hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index]; + index++; } } } } - *data_in = data_tmp; - fpga_free(tmp); } -float find_max(float *data_in, int data_size) { +float find_max(float* data_in, int data_size) { float max = 0.0; for (int i = 0; i < data_size; ++i) { float value = data_in[i]; @@ -83,166 +123,178 @@ float find_max(float *data_in, int data_size) { return max; } -signed char float_to_int8(float fdata) { +int8_t float_to_int8(float fdata) { if (fdata < 0.0) { fdata -= 0.5; } else { fdata += 0.5; } - return (signed char)fdata; + return (int8_t)fdata; } -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; +void quantize(float* src, int8_t* dst, int len, float max) { float fix_range = 127; float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); + for (size_t i = 0; i < len; i++) { + dst[i] = float_to_int8(src[i] * scale); } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); } -void align_element(char **data_in, int num, int chw) { - int j = 0; +bool should_align_chw(int chw) { int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if 
(align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); + return align_chw != chw; +} + +void align_chw(int8_t* src, int8_t* dst, int num, int chw) { + int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + memset(dst, 0, num * aligned_chw); + for (int j = 0; j < num; j++) { + memcpy((dst + j * aligned_chw), (src + j * chw), chw); } } -void align_num(char **data_in, +void align_num(int8_t* src, + int8_t* dst, int num_per_div_before_alignment, int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + int align_chw) { + int filter_num_alignment = get_filter_num_alignment(); int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + align_to_x(num_per_div_before_alignment, filter_num_alignment); - char *tmp = *data_in; int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); + memset(dst, 0, num_element * sizeof(int8_t)); + int i = 0; for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, + memcpy(dst + num_per_div_after_alignment * align_chw * i, + src + num_per_div_before_alignment * align_chw * i, num_per_div_before_alignment * align_chw); } - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, + memcpy(dst + num_per_div_after_alignment * align_chw * i, + src + num_per_div_before_alignment * align_chw * i, (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); } -void reorder(char **data_in, int num_after_alignment, int chw) { +void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) { int index = 0; int new_index = 0; - + int filter_num_alignment = get_filter_num_alignment(); int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, - *data_in + new_index * chw_align, - chw_align); + new_index = index / filter_num_alignment * filter_num_alignment + + (index % (filter_num_alignment / 2) / 4 * 8) + + (index % (filter_num_alignment / 2) % 4) + + (index / (filter_num_alignment / 2) % 2 * 4); + memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align); } - *data_in = data_tmp; - fpga_free(tmp); } -size_t interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; +void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) { int interleave_per_num = 16; - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; int interleave_num = chw_align * 2 / interleave_per_num; - for 
(i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, + for (int i = 0; i < num_after_alignment; i += 2) { + for (int j = 0, k = 0; j < interleave_num; j += 2, k++) { + memcpy(dst + i * chw_align + interleave_per_num * j, + src + i * chw_align + interleave_per_num * k, interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, + memcpy(dst + i * chw_align + interleave_per_num * (j + 1), + src + (i + 1) * chw_align + interleave_per_num * k, interleave_per_num); } } - *data_in = data_tmp; - fpga_free(tmp); - return chw_align * num_after_alignment; } -size_t format_filter(float **data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max) { +int8_t* format_filter(float* data_in, + int& mem_size_a, // NOLINT + int num, + int channel, + int height, + int width, + int group_num, + float max, + std::vector<float>& filter_max) { // NOLINT int data_size = channel * height * width * num; int chw = channel * height * width; int division_capacity = calc_division_capacity(chw); + int filter_num_alignment = get_filter_num_alignment(); int num_per_div_before_alignment = calc_num_per_div(num, group_num, division_capacity); int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + align_to_x(num_per_div_before_alignment, filter_num_alignment); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int residual = num % num_per_div_before_alignment; int num_after_alignment = num_per_div_after_alignment * ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); + align_to_x(residual, filter_num_alignment); + + int8_t* quantized_data = + reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t))); + + for (int n = 0; n < num; n++) { + float* filter_start = data_in + n * chw; + int8_t* quantized_start = quantized_data + n * chw; + quantize(filter_start, quantized_start, chw, max); + filter_max.push_back(1); } - reorder(quantize_data, num_after_alignment, chw); - size_t mem_size = interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, + int8_t* hwc_data = + reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t))); + convert_to_hwc(quantized_data, hwc_data, num, channel, height, width); + fpga_free(quantized_data); + + int8_t* temp_data = hwc_data; // NOLINT + int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + if (should_align_chw(chw)) { + int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>( + fpga_malloc(num * chw_aligned * sizeof(int8_t))); + align_chw(hwc_data, hwc_aligned_data, num, chw); + + temp_data = hwc_aligned_data; + fpga_free(hwc_data); + } + if (num_after_alignment != num) { + int filter_num_alignment = get_filter_num_alignment(); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, filter_num_alignment); + + int num_element = div_num * num_per_div_after_alignment * chw_aligned; + int8_t* num_aligned_data = + reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t))); +
align_num(temp_data, + num_aligned_data, + num_per_div_before_alignment, + num, + chw_aligned); + + fpga_free(temp_data); + temp_data = num_aligned_data; + } + int8_t* aligned_data = + reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned)); + reorder(temp_data, aligned_data, num_after_alignment, chw); + fpga_free(temp_data); + int8_t* interleaved_data = + reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned)); + interleave(aligned_data, interleaved_data, num_after_alignment, chw); + fpga_free(aligned_data); + fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment * sizeof(char)); - return mem_size; + mem_size_a = num_after_alignment * chw_aligned; + return interleaved_data; } -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT +void convert_to_hwn(int16_t** data_in, int num, int height, int width) { + int16_t* tmp = *data_in; + int16_t* data_tmp = + (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT for (int n = 0; n < num; n++) { for (int h = 0; h < height; h++) { for (int w = 0; w < width; w++) { @@ -254,16 +306,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) { fpga_free(tmp); } -size_t align_element_n(int16_t **data_in, int num, int height, int width) { +size_t align_element_n(int16_t** data_in, int num, int height, int width) { int unalign_n = num; int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); int num_element = height * width * align_n; if (unalign_n != align_n) { - int16_t *tmp = *data_in; + int16_t* tmp = *data_in; int num_element = height * width * align_n; - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT + int16_t* data_tmp = + (int16_t*)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT memset(data_tmp, 0, num_element * sizeof(int16_t)); for (int h = 0; h < height; h++) { @@ -276,17 +328,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) { } } *data_in = data_tmp; - free(tmp); + fpga_free(tmp); } return num_element * sizeof(int16_t); } +void to_fp16(float* src, + float16* dst, + int num, + int height, + int width, + float* scale_ptr) { + int size = num * height * width; + for (int n = 0; n < num; n++) { + float scale_val = scale_ptr[n]; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + int index = n * height * width + h * width + w; + float value = src[index] * scale_val; + dst[index] = float_to_half(value); + } + } + } + fpga_flush(dst, size * sizeof(int16_t)); +} + void quantize_to_fp16( - float **data_in, int num, int height, int width, float *scale_ptr) { - float *tmp = *data_in; + float** data_in, int num, int height, int width, float* scale_ptr) { + float* tmp = *data_in; int size = num * height * width; - float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT + float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16)); // NOLINT for (int n = 0; n < num; n++) { float scale_val = scale_ptr[n]; for (int h = 0; h < height; h++) { @@ -298,13 +370,14 @@ void quantize_to_fp16( } } fpga_flush(tmp_data, size * sizeof(int16_t)); - *data_in = (float *)tmp_data; // NOLINT + *data_in = (float*)tmp_data; // NOLINT fpga_free(tmp); } size_t format_dwconv_filter( - float **data_in, int num, int height, int width, float *scale_ptr) { + float** data_in, int num, int height, int width, float* scale_ptr) {
 size_t format_dwconv_filter(
-    float **data_in, int num, int height, int width, float *scale_ptr) {
+    float** data_in, int num, int height, int width, float* scale_ptr) {
   quantize_to_fp16(data_in, num, height, width, scale_ptr);
-  int16_t **quantize_data = (int16_t **)data_in;  // NOLINT
+  int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
+  convert_to_hwn(quantize_data, num, height, width);
   size_t size = align_element_n(quantize_data, num, height, width);
   fpga_flush(*quantize_data,
diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h
index 7d9c6c2e015250cbcba2d1dba71b7c1f3554d9f0..42d98e74923e116240b145c87b3dc5cfa0210f8d 100644
--- a/lite/backends/fpga/KD/llapi/filter.h
+++ b/lite/backends/fpga/KD/llapi/filter.h
@@ -18,38 +18,36 @@ limitations under the License. */

 #include
 #include
+#include <vector>
+
 namespace paddle {
 namespace zynqmp {
 namespace filter {

 void set_filter_capacity(uint32_t cap);
+void set_colunm(uint32_t column);
+int get_filter_num_alignment();
 int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
-void convert_to_hwc(
-    char** data_in, int num, int channel, int height, int width);
+int calc_pack_num(int num_per_group, int group, int division_capacity);
+
 float find_max(float* data_in, int data_size);
-void quantize(float** data_in, int data_size, float max);
-void align_element(char** data_in, int num, int chw);
-void align_num(char** data_in,
-               int num_per_div_before_alignment,
-               int num,
-               int chw);
-void reorder(char** data_in, int num_after_alignment, int chw);
-size_t interleave(char** data_in, int num_after_alignment, int chw);
-size_t format_filter(float** data_in,
-                     int num,
-                     int channel,
-                     int height,
-                     int width,
-                     int group_num,
-                     float max);
+int8_t* format_filter(float* data_in,
+                      int& mem_size,  // NOLINT
+                      int num,
+                      int channel,
+                      int height,
+                      int width,
+                      int group_num,
+                      float max,
+                      std::vector<float>& filter_max);  // NOLINT

 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
-void quantize_to_fp16(
-    float** data_in, int num, int height, int width, float* scale_ptr);
+// void quantize_to_fp16(float** data_in, int num, int height, int width,
+//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
index 1f1226ead3d4e9b50100f4de574104a5d6f777b2..bcbf2b98f487aea3c6516fa6369e70d11be97ffc 100644
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -23,13 +23,12 @@ limitations under the License.
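Note: filter.h above replaces the in-place format_filter with one that returns a freshly allocated int8 buffer, reports its size through the mem_size reference, and fills filter_max with per-split maxima. A hedged usage sketch against that declaration (the shape values and the quantize_weights wrapper are illustrative, not part of the patch):

    #include <cstdint>
    #include <vector>
    #include "lite/backends/fpga/KD/llapi/filter.h"

    // weights holds num * channel * height * width floats laid out as NCHW.
    int8_t* quantize_weights(float* weights, int num, int channel, int h, int w) {
      namespace f = paddle::zynqmp::filter;
      float max_value = f::find_max(weights, num * channel * h * w);
      int mem_size = 0;
      std::vector<float> filter_max;  // per-split maxima, filled by the callee
      int8_t* formatted = f::format_filter(
          weights, mem_size, num, channel, h, w, /*group_num=*/1, max_value,
          filter_max);
      // The caller owns the returned buffer (mem_size bytes); release it with
      // fpga_free() once it has been handed to the device.
      return formatted;
    }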
*/ #include #include -#include "lite/backends/fpga/KD/llapi/config.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" namespace paddle { namespace zynqmp { -#define PADDLE_LITE_OS_LINUX +#define PADDLE_MOBILE_OS_LINUX static int fd = -1; static const char *device_path = "/dev/fpgadrv0"; @@ -39,14 +38,10 @@ static size_t memory_size_max = 0; static size_t memory_size = 0; static inline int do_ioctl(uint64_t req, const void *arg) { - int ret = -1; -#ifdef PADDLE_LITE_OS_LINUX - ret = ioctl(fd, req, arg); - if (ret != 0) { - throw - 1; - } +#ifdef PADDLE_MOBILE_OS_LINUX + return ioctl(fd, req, arg); #else - return ret; + return -1; #endif } @@ -66,15 +61,33 @@ void reset_device() { // memory management; void *fpga_malloc(size_t size) { -#ifdef PADDLE_LITE_OS_LINUX +#ifdef PADDLE_MOBILE_OS_LINUX + void *ptr = reinterpret_cast( mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); - if (ptr == NULL) { + if (ptr == MAP_FAILED) { std::cout << "not enough memory !"; exit(-1); } + if (errno == ENOMEM) { + std::cout << "mmap failed with not enough memory !"; + exit(-1); + } + if (errno == EINVAL) { + std::cout << "mmap failed with invalid arguments ! (size=" << size << ")" + << std::endl; + exit(-1); + } + if (ptr == NULL) { + std::cout << "NULL returned, errno=" << errno + << ", mmap failed with other errors other than memory usage !" + << std::endl; + exit(-1); + } + memory_map.insert(std::make_pair(ptr, size)); memory_size += size; + if (memory_size > memory_size_max) { memory_size_max = memory_size; } @@ -90,7 +103,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; } size_t fpga_diagnose_memory(int detailed) { size_t total = 0; - auto iter = memory_map.begin(); // std::map::iterator + auto iter = memory_map.begin(); while (iter != memory_map.end()) { total += iter->second; iter++; @@ -100,7 +113,7 @@ size_t fpga_diagnose_memory(int detailed) { void fpga_free(void *ptr) { size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator + auto iter = memory_map.find(ptr); if (iter != memory_map.end()) { size = iter->second; memory_map.erase(iter); @@ -108,8 +121,7 @@ void fpga_free(void *ptr) { memory_size -= size; -#ifdef PADDLE_LITE_OS_LINUX - +#ifdef PADDLE_MOBILE_OS_LINUX munmap(ptr, size); #else free(ptr); @@ -150,6 +162,11 @@ void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } +int fpga_reset() { + struct FpgaResetArgs args; + return do_ioctl(IOCTL_FPGA_RESET, &args); +} + int ioctl_conv(const struct ConvArgs &args) { return do_ioctl(IOCTL_CONFIG_CONV, &args); } @@ -166,7 +183,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) { } if (split_num > 1) { - std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" 
<< std::endl; exit(-1); } return ret; @@ -186,6 +202,7 @@ int get_device_info(const struct DeviceInfo &args) { } int perform_bypass(const struct BypassArgs &args) { + int ret = -1; int size = args.image.channels * args.image.width * args.image.height; int max_size = 1 << 21; @@ -213,7 +230,7 @@ int perform_bypass(const struct BypassArgs &args) { reinterpret_cast(input_address + i * max_size * type_size); bypassArgs.output.address = reinterpret_cast(output_address + i * max_size * out_type_size); - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); + ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); scale = std::max(scale, scales[0]); if (ret != 0) { @@ -222,13 +239,16 @@ int perform_bypass(const struct BypassArgs &args) { } int remainder = size - max_size * count; - bypassArgs.image.channels = remainder; - bypassArgs.image.address = - reinterpret_cast(input_address + count * max_size * type_size); - bypassArgs.output.address = reinterpret_cast( - output_address + count * max_size * out_type_size); - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); + if (remainder > 0) { + bypassArgs.image.channels = remainder; + bypassArgs.image.address = + reinterpret_cast(input_address + count * max_size * type_size); + bypassArgs.output.address = reinterpret_cast( + output_address + count * max_size * out_type_size); + ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); + scale = std::max(scale, scales[0]); + } + args.output.scale_address[0] = scale; args.output.scale_address[1] = 1.0f / scale; return ret; @@ -261,28 +281,13 @@ int compute_fpga_scale(const struct ScaleArgs &args) { } int compute_fpga_dwconv(const struct DWconvArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif return do_ioctl(IOCTL_CONFIG_DWCONV, &args); } +int config_activation(const struct ActiveParamterArgs &args) { + return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args); +} + int config_inplace(const struct InplaceArgs &args) { return do_ioctl(IOCTL_CONFIG_INPLACE, &args); } diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h index 7d22de95a2272862c6fe781295bdaab7177a92fe..55c2fde079a1ca0ec368870e2bb8f727d870a8f3 100644 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.h +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.h @@ -14,6 +14,9 @@ limitations under the License. 
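Note: perform_bypass above now submits the image in fixed 2^21-element chunks and only fires the trailing ioctl when a non-empty remainder is left; previously the tail call ran unconditionally, even with zero channels. The chunking arithmetic in isolation:

    #include <cstdio>

    int main() {
      int size = 5 << 20;           // channels * width * height, illustrative
      int max_size = 1 << 21;       // per-ioctl element limit
      int count = size / max_size;  // number of full chunks
      int remainder = size - max_size * count;
      for (int i = 0; i < count; i++) {
        printf("chunk %d: offset %d, %d elements\n", i, i * max_size, max_size);
      }
      if (remainder > 0) {          // the guard this patch adds
        printf("tail: offset %d, %d elements\n", count * max_size, remainder);
      }
      return 0;
    }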
*/ #pragma once +#ifndef PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H +#define PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H + #include #include #include @@ -25,7 +28,6 @@ namespace zynqmp { typedef int16_t half; #define IMAGE_ALIGNMENT 16 // Aligned to 16 -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 #define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 #define BS_NUM_ALIGNMENT 8 #define BIAS_NUM_ALIGNMENT 16 @@ -40,15 +42,19 @@ enum DLayoutType { LAYOUT_HWC = 0, }; -struct VersionArgs { - void* buffer; +enum ActiveType { + TYPE_NONE = 0, + TYPE_RELU = 1, + TYPE_RELU6 = 2, + TYPE_LEAKY_RELU = 3, + TYPE_SIGMOID = 4, }; struct DeviceInfo { uint32_t filter_cap; uint32_t version; uint16_t device_type; - uint32_t reserved0; + uint32_t colunm; uint32_t reserved1; uint32_t reserved2; uint32_t reserved3; @@ -57,6 +63,11 @@ struct DeviceInfo { uint32_t reserved6; }; +struct VersionArgs { + void* buffer; + size_t size; +}; + struct MemoryCopyArgs { void* src; void* dest; @@ -68,7 +79,9 @@ struct MemoryCacheArgs { size_t size; }; -struct MemoryBarrierArgs {}; +struct MemoryBarrierArgs { + uint16_t dummy; +}; struct BNArgs { bool enabled; @@ -108,6 +121,7 @@ struct ConvArgs { void* filter_scale_address; uint32_t filter_num; uint32_t group_num; + uint32_t dilation; struct KernelArgs kernel; struct ImageInputArgs image; // input image; @@ -199,9 +213,16 @@ struct NormalizeParameterArgs { uint32_t hight_width; }; +struct ActiveParamterArgs { + ActiveType type; + uint16_t leaky_relu_factor; +}; + struct InplaceArgs { bool leaky_relu_enable; bool relu_enable; + bool sigmoid_enable; + bool relu6_enable; bool power_enable; bool normalize_enable; }; @@ -216,7 +237,9 @@ struct FpgaRegReadArgs { uint64_t value; }; -struct FpgaResetArgs {}; +struct FpgaResetArgs { + uint32_t val; +}; #define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4) @@ -248,6 +271,8 @@ struct FpgaResetArgs {}; _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs) #define IOCTL_CONFIG_NORMALIZE_PARAMETER \ _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs) +#define IOCTL_CONFIG_ACTIVATION_PARAMETER \ + _IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs) #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs) #define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs) @@ -331,6 +356,7 @@ int compute_fpga_scale(const struct ScaleArgs& args); int compute_fpga_concat(const struct ConcatArgs& args); int compute_fpga_resize(const struct ResizeArgs& args); +int config_activation(const struct ActiveParamterArgs& args); int config_power(const struct PowerArgs& args); int compute_fpga_dwconv(const struct DWconvArgs& args); int config_norm_param(const struct NormalizeParameterArgs& args); @@ -341,7 +367,11 @@ int config_inplace(const struct InplaceArgs& args); int flush_cache(void* addr, int size); int invalidate_cache(void* addr, int size); +int fpga_reset(); + int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); } // namespace zynqmp } // namespace paddle + +#endif // PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H diff --git a/lite/backends/fpga/KD/pe.hpp b/lite/backends/fpga/KD/pe.hpp index d1dc3c4caa18cbfeba74fac26cca9e19230e2c21..2796124341012574dc719ae9f30633d1d9524680 100644 --- a/lite/backends/fpga/KD/pe.hpp +++ b/lite/backends/fpga/KD/pe.hpp @@ -32,6 +32,5 @@ class PE { virtual ~PE() {} }; - } // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/pe_params.hpp 
b/lite/backends/fpga/KD/pe_params.hpp
index 709f04d399793c6f21c34fc1265f7ed8b5818314..42ec32957e5884aaae3cc96f46060de114b44ead 100644
--- a/lite/backends/fpga/KD/pe_params.hpp
+++ b/lite/backends/fpga/KD/pe_params.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include
+#include <string>
 #include

 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -26,10 +27,16 @@ namespace zynqmp {
 struct ReLUParam {
  public:
   bool enabled = false;
+  float leaky_relu_factor = 0.0f;
+};
+
+struct ActiveParam {
+  enum ActiveType type = TYPE_NONE;
+  float leaky_relu_factor;
 };

 struct PEParam {
-  ReLUParam relu;
+  ActiveParam activeParam;
 };

 struct InputParam : PEParam {
@@ -133,6 +140,13 @@ struct ElementwiseAddParam : PEParam {
   EWAddArgs ewargs;
 };

+struct ElementwiseMulParam : PEParam {
+ public:
+  Tensor* input_x;
+  Tensor* input_y = nullptr;
+  Tensor* output = nullptr;
+};
+
 struct FullyConnectedParam : PEParam {
  public:
   Tensor* input = nullptr;
@@ -197,6 +211,17 @@ struct PriorBoxParam : PEParam {
   float offset;
 };

+struct YoloBoxParam : PEParam {
+  Tensor* input;
+  Tensor* imgSize;
+  Tensor* outputBoxes;
+  Tensor* outputScores;
+  int downsampleRatio;
+  std::vector<int> anchors;
+  int classNum;
+  float confThresh;
+};
+
 struct ScaleParam : PEParam {
  public:
   Tensor* input = nullptr;
@@ -229,5 +254,24 @@ struct CropParam : PEParam {
   std::vector<int> offsets;
   std::vector<int> shape;
 };
+
+struct GRUParam : PEParam {
+ public:
+  Tensor* input = nullptr;
+  Tensor* h0 = nullptr;
+  Tensor* weight = nullptr;
+  Tensor* bias = nullptr;
+
+  Tensor* batch_gate = nullptr;
+  Tensor* batch_reset_hidden_prev = nullptr;
+  Tensor* batch_hidden = nullptr;
+  Tensor* hidden = nullptr;
+
+  std::string gate_activation = "sigmoid";
+  std::string activation = "tanh";
+  bool is_reverse = false;
+  bool origin_mode = false;
+};
+
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index e897f82280fa57f904bd7c749e371d8ec9219b51..b4eac2c41e138cab19197ccb8ab89681a69ec6fe 100644
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include
+#include
 #include

 #include "lite/backends/fpga/KD/pe.hpp"
@@ -24,6 +25,7 @@ limitations under the License.
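Note: with ReLUParam folded into the new ActiveParam, every PE now reads one activation descriptor instead of a bare relu flag. A sketch of how kernel code might populate it from a framework-level activation name (this helper is illustrative and not part of the patch):

    #include <string>
    #include "lite/backends/fpga/KD/pe_params.hpp"

    void set_activation(paddle::zynqmp::PEParam& param,
                        const std::string& act, float slope) {
      using namespace paddle::zynqmp;
      if (act == "relu") {
        param.activeParam.type = TYPE_RELU;
      } else if (act == "relu6") {
        param.activeParam.type = TYPE_RELU6;
      } else if (act == "leaky_relu") {
        param.activeParam.type = TYPE_LEAKY_RELU;
        param.activeParam.leaky_relu_factor = slope;  // only type with a factor
      } else if (act == "sigmoid") {
        param.activeParam.type = TYPE_SIGMOID;
      } else {
        param.activeParam.type = TYPE_NONE;
      }
    }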
*/ #include "lite/backends/fpga/KD/pes/conv_process.hpp" #include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" #include "lite/backends/fpga/KD/pes/scale_pe.hpp" +#include "lite/backends/fpga/KD/pes/split_pe.hpp" namespace paddle { namespace zynqmp { @@ -40,6 +42,8 @@ class ConvPE : public PE { void apply() { split_axis = fill_split_arg(param_); + split_channel = param_.groups != 1 && param_.splitParams().size() > 1; + if (split_axis == 0 && param_.splitParams().size() > 1) { ConcatParam& concat_param = concatPE_.param(); for (auto conv_param : param_.splitParams()) { @@ -49,6 +53,28 @@ class ConvPE : public PE { concatPE_.init(); concatPE_.apply(); } + + if (split_channel) { + SplitParam& split_param = splitPE_.param(); + split_param.input = param_.input; + for (auto conv_param : param_.splitParams()) { + split_param.outputs.push_back(&conv_param->input); + } + splitPE_.init(); + splitPE_.apply(); + } + + if (DLEngine::get_instance().isZU3() && + param_.input->shape().dimSize() == 4 && + param_.input->shape().width() == 1 && + param_.input->shape().channel() >= 2048) { + use_cpu_ = true; + } + if (!use_cpu_) { + // param_.filter->releaseData(); + } + + // exit(-1); } void cpu_compute() { Tensor* input = param_.input; @@ -59,6 +85,7 @@ class ConvPE : public PE { Tensor float_output; float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); + // float16* data_out = output->data(); float* out = float_output.mutableData(FP32, output->shape()); int out_channel = output->shape().channel(); @@ -66,13 +93,21 @@ class ConvPE : public PE { float* filter_data = param_.filter->data(); float* mi = new float[in_channel]; - for (int i = 0; i < out_channel; i++) { float* image = image_addr; float* filter_ptr = filter_data + i * in_channel; float* out_ptr = mi; #pragma omp parallel for for (int j = 0; j < in_channel; j++) { + // float32x4_t x0 = vld1q_f32(image); + // float32x4_t x1 = vld1q_f32(filter_ptr); + + // float32x4_t r = vmulq_f32(x0, x1); + + // vst1q_f32(out_ptr, r); + // image += 4; + // filter_ptr += 4; + // out_ptr += 4; float value = image_addr[j] * filter_ptr[j]; mi[j] = value; } @@ -89,49 +124,104 @@ class ConvPE : public PE { } bool dispatch() { - inplace_.relu_enable = param_.relu.enabled; - inplace_.power_enable = false; - inplace_.normalize_enable = false; + fpga_reset(); + if (use_cpu_) { + cpu_compute(); + return true; + } - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; + if (param_.activeParam.type == TYPE_RELU) { + inplace_.relu_enable = true; + } else if (param_.activeParam.type == TYPE_RELU6) { + inplace_.relu6_enable = true; + } else if (param_.activeParam.type == TYPE_SIGMOID) { + inplace_.sigmoid_enable = true; + } else if (param_.activeParam.type == TYPE_LEAKY_RELU) { + inplace_.leaky_relu_enable = true; + } + + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { config_inplace(inplace_); + if (inplace_.leaky_relu_enable) { + activeParamterArgs.type = TYPE_LEAKY_RELU; + activeParamterArgs.leaky_relu_factor = + fp32_2_fp16(param_.activeParam.leaky_relu_factor); + config_activation(activeParamterArgs); + } } std::vector& params = param_.splitParams(); + + if (split_channel) { + // splitPE_.param().input->saveToFile("input_image",true); + splitPE_.dispatch(); + } + int ret = 0; for (auto conv_param : params) { + // conv_param->input.printScale(); + // if (split_channel) { + // conv_param->input.saveToFile("pack_image",true); + // } ret |= 
compute_fpga_conv_basic(conv_param->args); } - if (param_.relu.enabled) { + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { inplace_.relu_enable = false; + inplace_.leaky_relu_enable = false; + inplace_.relu6_enable = false; + inplace_.sigmoid_enable = false; config_inplace(inplace_); + + if (inplace_.leaky_relu_enable) { + activeParamterArgs.type = TYPE_LEAKY_RELU; + activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0); + config_activation(activeParamterArgs); + } } size_t size = params.size(); if (split_axis == 0 && ret == 0 && size > 1) { + // std::cout << "concat size:" << size << std::endl; concatPE_.dispatch(); } if (split_axis == 1 && ret == 0 && size > 1) { + // for (int n = 0; n < size - 1; n++) { ElementwiseAddParam& add_param = addPE_.param(); add_param.inputs = {¶ms[0]->output, ¶ms[1]->output}; add_param.output = param_.output; addPE_.init(); addPE_.apply(); addPE_.dispatch(); + + // param_.output->printScale(); + + // params[0]->input.saveToFile("conv_1.txt"); + // params[1]->input.saveToFile("conv_2.txt"); + + // params[0]->output.saveToFile("ew_o1.txt"); + // params[1]->output.saveToFile("ew_o2.txt"); + // std::cout << "\n ================== EW ================== \n"; + // } } + return ret == 0; } ConvParam& param() { return param_; } private: + bool use_cpu_ = false; + bool split_channel = false; ConvParam param_; ConcatPE concatPE_; + SplitPE splitPE_; ElementwiseAddPE addPE_; int split_axis = 0; InplaceArgs inplace_ = {0}; + ActiveParamterArgs activeParamterArgs; }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp old mode 100644 new mode 100755 index fd17218d06f050df3dc935bdde0a320e52b56a40..cea22e0edc647b3bf4f0ac15e43121b5d8926154 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -14,6 +14,9 @@ limitations under the License. 
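Note: ConvPE::dispatch above raises exactly one InplaceArgs flag per ActiveType before the conv ioctls and clears all of them afterwards so the hardware state does not leak into later ops. The same set/clear pattern factored into helpers (a refactoring sketch, not code from the patch):

    #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"

    namespace zq = paddle::zynqmp;

    // Raise the flag matching `type` and push it to the device if any flag is set.
    void enable_inplace(zq::InplaceArgs& inplace, zq::ActiveType type) {
      inplace.relu_enable = (type == zq::TYPE_RELU);
      inplace.relu6_enable = (type == zq::TYPE_RELU6);
      inplace.sigmoid_enable = (type == zq::TYPE_SIGMOID);
      inplace.leaky_relu_enable = (type == zq::TYPE_LEAKY_RELU);
      if (inplace.relu_enable || inplace.relu6_enable ||
          inplace.sigmoid_enable || inplace.leaky_relu_enable) {
        zq::config_inplace(inplace);
      }
    }

    // Drop every activation flag again once the compute ioctls are done.
    void disable_inplace(zq::InplaceArgs& inplace) {
      inplace.relu_enable = false;
      inplace.relu6_enable = false;
      inplace.sigmoid_enable = false;
      inplace.leaky_relu_enable = false;
      zq::config_inplace(inplace);
    }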
*/ #pragma once +#ifndef conv_process_hpp +#define conv_process_hpp + #include #include #include @@ -45,7 +48,19 @@ inline int get_split_num(Tensor* filter) { filter->shape().width(); auto num = filter->shape().num(); int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); + int filter_num_alignment = filter::get_filter_num_alignment(); + int aligned_num = align_to_x(num, filter_num_alignment); + return filter::calc_split_num(aligned_num, div_capacity); +} + +inline int get_pack_num(Tensor* filter, int group_num) { + auto chw = filter->shape().channel() * filter->shape().height() * + filter->shape().width(); + auto num = filter->shape().num(); + int div_capacity = filter::calc_division_capacity(chw); + int filter_num_alignment = filter::get_filter_num_alignment(); + int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment); + return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity); } inline void fill_scale_bias_const(ConvParam* param_) { @@ -112,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn, param_->bias()->setDataLocation(CPU); } +inline int gcd_(int a, int b) { + while (b) { + int temp = a; + a = b; + b = temp % b; + } + return a; +} + +inline int lcm_(int a, int b) { return a * b / gcd_(a, b); } + +inline void format_bias_scale_new(Tensor* bias, + Tensor* scale, + Tensor* scale_bias) { + Shape& bias_shape = bias->shape(); + int channel = bias_shape.channel(); + int repeat = 1; + int alignment = 16; + int length = channel; + + if (channel % alignment != 0 || channel < alignment) { + int c_lcm = lcm_(channel, alignment); + repeat = c_lcm / (channel); + } + Shape shape(N, {2 * channel * repeat}); + float16* scale_bias_data = scale_bias->mutableData(FP16, shape); + + float* bias_data_float = bias->data(); + float* scale_data_float = scale->data(); + + for (int i = 0; i < repeat; i++) { + for (int j = 0; j < length; j++) { + float16 value_bias = float_to_half(bias_data_float[j]); + scale_bias_data[i * length + j] = value_bias; + } + } + for (int i = 0; i < repeat; i++) { + for (int j = 0; j < length; j++) { + float16 value_scale = float_to_half(scale_data_float[j]); + scale_bias_data[i * length + j + length * repeat] = value_scale; + } + } +} + inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter, @@ -126,41 +185,99 @@ inline void format_scale_bias(Tensor* scale, bias_data = bias->data(); } int channel = filter->shape().num(); - Shape bias_scale_shape(N, {2 * channel}); + int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group; + + int c_per_group = channel / group; + int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT); + + Shape bias_scale_shape(N, {2 * scale_bias_len}); float* bs_data = scale_bias->mutableData(FP32, bias_scale_shape); - for (int i = 0; i < channel; i++) { - float scale_value = scale_data == nullptr ? 1 : scale_data[i]; - float bias_value = bias_data == nullptr ? 
0 : bias_data[i]; - bs_data[i + channel] = scale_value; - bs_data[i] = bias_value; + float* temp_data = + reinterpret_cast(fpga_malloc(2 * scale_bias_len * sizeof(float))); + memset(temp_data, 0, 2 * scale_bias_len * sizeof(float)); + + std::vector scales; + if (scale_data != nullptr) { + for (int i = 0; i < channel; ++i) { + scales.push_back(scale_data[i]); + } + for (int i = 0; i < scale_bias_len - channel; i++) { + scales.push_back(1); + } + } else { + for (int i = 0; i < scale_bias_len; i++) { + scales.push_back(1); + } + } + + for (int i = 0; i < scale_bias_len; ++i) { + temp_data[i + scale_bias_len] = 1; + temp_data[i] = 0; } - int element_num_per_div = get_filter_num_per_div(filter, group); - bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); + for (int g = 0; g < group; g++) { + for (int c = 0; c < c_per_group; c++) { + int src_index = g * c_per_group + c; + int dst_index = g * aligned_c_per_group + c; + float scale_value = scales[src_index]; + float bias_value = bias_data == nullptr ? 0 : bias_data[src_index]; + temp_data[dst_index + scale_bias_len] = scale_value; + temp_data[dst_index] = bias_value; + } + } + + bias_scale::format_bias_scale_array( + &temp_data, scale_bias_len / group, scale_bias_len); + memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float)); } -inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { +inline void format_filter(Tensor* filter, + Tensor* quantized_filter, + int group, + std::vector& scales, // NOLINT + float max) { float max_value = find_max(*filter); + // max_value = max; //TODO: global quantization for filter Shape& filter_shape = filter->shape(); + + int mem_size; + std::vector max_values; + int8_t* quantized_data = filter::format_filter(filter->data(), + mem_size, + filter_shape.num(), + filter_shape.channel(), + filter_shape.height(), + filter_shape.width(), + group, + max_value, + max_values); + + float mem_factor = mem_size * 1.0f / filter->shape().numel(); + quantized_filter->setMemScale(mem_factor); + quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); + int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); quantized_filter->scale()[0] = max_value / 127.0f; quantized_filter->scale()[1] = 127.0f / max_value; - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = reinterpret_cast(fpga_malloc(memory_size)); - memcpy(new_data, filter->data(), memory_size); - size_t mem_size = filter::format_filter(&new_data, - filter_shape.num(), - filter_shape.channel(), - filter_shape.height(), - filter_shape.width(), - group, - max_value); - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, mem_size); - fpga_free(new_data); + memcpy(src, quantized_data, mem_size); quantized_filter->flush(); + fpga_free(quantized_data); + + // for (size_t i = 0; i < max_values.size(); i++) { + // // scales.push_back(max_values[i] / max_value); + // scales.push_back(1.0f); + // } + + // filter->saveToFile("filter.txt"); + // std::ofstream ofs; + // ofs.open("quant.txt"); + // for (int i = 0; i < mem_size; i++) { + // float value = quantized_data[i]; + // ofs << value << std::endl; + // } + // ofs.close(); + // exit(-1); } inline void format_dw_filter(Tensor* filter, @@ -207,10 +324,11 @@ inline void split_filter_num(const ConvParam& c_param) { Tensor* out = param.output; Tensor* filter = param.filter; auto channel = out->shape().channel(); - - int split_num = param.groups == 1 ? 
get_split_num(param.filter) : 1; + int split_num = get_split_num(param.filter); int filter_num_per_div = get_filter_num_per_div(filter, param.groups); + float max = find_max(*filter); + Shape& out_shape = out->shape(); for (int i = 0; i < split_num; i++) { BasicConvParam* conv_param = new BasicConvParam(); @@ -251,17 +369,18 @@ inline void split_filter_num(const ConvParam& c_param) { filter->data() + i * filter_num_per_div * filter_hwc, filter_num * filter_hwc * sizeof(float)); new_filter.flush(); - conv_param->filter.mutableData(FP32, f_shape); - format_filter(&new_filter, &(conv_param->filter), param.groups); - int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); + std::vector v; // TODO(chonwhite) change variable name; + format_filter(&new_filter, &(conv_param->filter), param.groups, v, max); + conv_param->filter.setDataType(INT8); + Tensor scale; Tensor bias; int chnnnel_start = i * filter_num_per_div; - Shape s_shape(N, {filter_num}); + Shape s_shape(NC, {1, filter_num}); float* scale_data = scale.mutableData(FP32, s_shape); float* bias_data = bias.mutableData(FP32, s_shape); for (int n = 0; n < filter_num; n++) { @@ -270,17 +389,11 @@ inline void split_filter_num(const ConvParam& c_param) { for (int n = 0; n < filter_num; n++) { bias_data[n] = param.bias()->data()[n + chnnnel_start]; } - Shape sb_shape(N, {sb_num}); - format_scale_bias(&scale, - &bias, - &conv_param->filter, - &conv_param->scaleBias, - param.groups); + format_bias_scale_new(&bias, &scale, &conv_param->scaleBias); conv_param->scaleBias.flush(); args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = conv_param->scaleBias.data(); + args.sb_address = conv_param->scaleBias.data(); args.kernel.stride_h = param.strides[1]; args.kernel.stride_w = param.strides[0]; args.kernel.height = new_filter.shape().height(); @@ -296,6 +409,137 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.height = input->shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; + + args.output.address = out_address; + args.output.scale_address = out_scale_address; + param.splitParams().push_back(conv_param); + } +} + +inline void pack_channel_filter(const ConvParam& c_param) { + ConvParam& param = const_cast(c_param); + Tensor* input = param.input; + Tensor* out = param.output; + Tensor* filter = param.filter; + int filter_num_alignment = filter::get_filter_num_alignment(); + auto filter_num = filter->shape().num(); + int pack_num = get_pack_num(param.filter, param.groups); + int group_per_pack = (param.groups + pack_num - 1) / pack_num; + int filter_per_group = filter_num / param.groups; + int filter_per_pack = filter_per_group * group_per_pack; + int channel_per_pack = filter->shape().channel() * group_per_pack; + + float max = find_max(*filter); + + Shape& out_shape = out->shape(); + + for (int i = 0; i < pack_num; i++) { + BasicConvParam* conv_param = new BasicConvParam(); + + conv_param->output.setDataLocation(Device); + conv_param->output.setAligned(true); + + float16* out_address = nullptr; + float* out_scale_address = nullptr; + + float16* input_address = nullptr; + + ConvArgs& args = conv_param->args; + + if (pack_num == 1) { + out_address = out->data(); + out_scale_address = out->scale(); + } + + int new_group = param.groups; + int filter_current_pack = filter->shape().num(); + int channel_current_pack = input->shape().channel(); + + new_group = i == pack_num - 1 + ? 
param.groups - (pack_num - 1) * group_per_pack + : group_per_pack; + filter_current_pack = new_group * filter_per_group; + channel_current_pack = new_group * filter->shape().channel(); + + if (pack_num == 1) { + input_address = input->data(); + } else { + Shape in_shape(NCHW, + {1, + channel_current_pack, + input->shape().height(), + input->shape().width()}); + input_address = conv_param->input.mutableData(FP16, in_shape); + } + + if (pack_num != 1) { + Shape shape( + NHWC, + {1, out_shape.height(), out_shape.width(), filter_current_pack}); + out_address = conv_param->output.mutableData(FP16, shape); + out_scale_address = conv_param->output.scale(); + } + Shape f_shape(NCHW, + {filter_current_pack, + filter->shape().channel(), + filter->shape().height(), + filter->shape().width()}); + + Tensor new_filter; + float* new_filter_data = new_filter.mutableData(FP32, f_shape); + int filter_hwc = filter->shape().height() * filter->shape().width() * + filter->shape().channel(); + + memcpy(new_filter_data, + filter->data() + i * filter_per_pack * filter_hwc, + filter_current_pack * filter_hwc * sizeof(float)); + new_filter.flush(); + conv_param->filter.mutableData(FP32, f_shape); + + float mem_factor = filter_num_alignment / filter_per_pack; + conv_param->filter.setMemScale(mem_factor); + + std::vector v; // TODO(chonwhite) change variable name + format_filter(&new_filter, &(conv_param->filter), new_group, v, max); + conv_param->filter.setDataType(INT8); + + Tensor scale; + Tensor bias; + + int chnnnel_start = i * filter_per_pack; + + Shape s_shape(NC, {1, filter_current_pack}); + float* scale_data = scale.mutableData(FP32, s_shape); + float* bias_data = bias.mutableData(FP32, s_shape); + for (int n = 0; n < filter_current_pack; n++) { + scale_data[n] = param.scale()->data()[n + chnnnel_start]; + } + for (int n = 0; n < filter_current_pack; n++) { + bias_data[n] = param.bias()->data()[n + chnnnel_start]; + } + format_bias_scale_new(&bias, &scale, &conv_param->scaleBias); + conv_param->scaleBias.flush(); + + args.group_num = new_group; + args.sb_address = conv_param->scaleBias.data(); + args.kernel.stride_h = param.strides[1]; + args.kernel.stride_w = param.strides[0]; + args.kernel.height = new_filter.shape().height(); + args.kernel.width = new_filter.shape().width(); + + args.filter_address = conv_param->filter.data(); + args.filter_num = filter_current_pack; + args.filter_scale_address = conv_param->filter.scale(); + args.image.address = input_address; + args.image.scale_address = input->scale(); + args.image.channels = channel_current_pack; + args.image.width = input->shape().width(); + args.image.height = input->shape().height(); + args.image.pad_width = param.paddings[1]; + args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; + args.output.address = out_address; args.output.scale_address = out_scale_address; param.splitParams().push_back(conv_param); @@ -310,9 +554,11 @@ inline void split_channel(const ConvParam& c_param) { int num = ceil(input->shape().channel() * 1.0f / 2047); int channel = input->shape().channel() / num; - std::cout << "channel::" << channel << "num::" << num << std::endl; + Shape bs_shape(N, {channel}); + float max = 1.0f; + for (int i = 0; i < num; i++) { BasicConvParam* conv_param = new BasicConvParam(); @@ -324,6 +570,7 @@ inline void split_channel(const ConvParam& c_param) { // filter transformation; Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); + Tensor new_filter; float* dst = new_filter.mutableData(FP32, f_shape); @@ 
-334,7 +581,9 @@ inline void split_channel(const ConvParam& c_param) { src += param.filter->shape().channel(); } new_filter.flush(); - format_filter(&new_filter, &(conv_param->filter), param.groups); + std::vector scales; + format_filter( + &new_filter, &(conv_param->filter), param.groups, scales, max); Tensor bias; Tensor scale; @@ -356,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) { ConvArgs& args = conv_param->args; args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; args.sb_address = conv_param->scaleBias.data(); args.kernel.stride_h = param.strides[1]; args.kernel.stride_w = param.strides[0]; @@ -374,6 +622,7 @@ inline void split_channel(const ConvParam& c_param) { args.image.height = conv_param->input.shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); param.splitParams().push_back(conv_param); @@ -384,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) { ConvParam& param = const_cast(c_param); Tensor* input = param.input; Tensor* output = param.output; + if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 && input->shape().width() == 1) { split_channel(c_param); return 1; - } else { + } else if (param.groups == 1) { split_filter_num(c_param); return 0; + } else { + pack_channel_filter(c_param); + return 0; } } @@ -407,7 +660,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) { for (int i = 0; i < 1; i++) { for (int i = 0; i < img.shape().numel(); i++) { float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; } } } @@ -416,3 +668,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } // namespace zynqmp } // namespace paddle + +#endif /* conv_process_hpp */ diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 index c29df623aa610d329a46ee337cdcb1abd801881c..1438aaba6565cefa72f863d5fc3af0a389fc95e0 --- a/lite/backends/fpga/KD/pes/crop_pe.cpp +++ b/lite/backends/fpga/KD/pes/crop_pe.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #include "lite/backends/fpga/KD/pes/crop_pe.hpp" -#include - namespace paddle { namespace zynqmp { diff --git a/lite/backends/fpga/KD/pes/crop_pe.hpp b/lite/backends/fpga/KD/pes/crop_pe.hpp index 6ebbcdb31f1afb7939c75a2ba9254c0b31f67d31..ccd1e0c98968375ebd840c7e8b15aedd6ad7ef77 100755 --- a/lite/backends/fpga/KD/pes/crop_pe.hpp +++ b/lite/backends/fpga/KD/pes/crop_pe.hpp @@ -14,6 +14,7 @@ limitations under the License. 
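Note: fill_split_arg above now picks between three lowering strategies; compressed into one predicate for readability (a summary sketch mirroring the conditions in the patch):

    // Which path a conv takes in fill_split_arg, given its input and grouping:
    //   - split_channel:       4-D output, >2047 input channels, width 1
    //                          (returns split_axis 1, joined by elementwise add)
    //   - split_filter_num:    ungrouped convs; filters split across passes and
    //                          concatenated back along the channel axis (axis 0)
    //   - pack_channel_filter: grouped convs; groups packed into hardware passes
    const char* split_strategy(int out_dim_size, int in_channels, int in_width,
                               int groups) {
      if (out_dim_size == 4 && in_channels > 2047 && in_width == 1) {
        return "split_channel";
      }
      if (groups == 1) {
        return "split_filter_num";
      }
      return "pack_channel_filter";
    }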
 */

 #pragma once

+#include
 #include
 #include

diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
index 9d7b9b544bff953662bab86f095823c5c7b3075b..9958990af6eb237d2122a63e1b7ed947ca329d31 100755
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -24,6 +24,17 @@ namespace zynqmp {

 class DepthwiseConvPE : public PE {
  public:
+  inline int gcd_(int a, int b) {
+    while (b) {
+      int temp = a;
+      a = b;
+      b = temp % b;
+    }
+    return a;
+  }
+
+  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
@@ -37,18 +48,61 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();

-    float* new_scale_data = param_.scale()->data<float>();
-    float* new_bias_data = param_.bias()->data<float>();
+    int repeat = 1;
+    int alignment = 16;
+    int length = channel;

-    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
-    for (int i = 0; i < channel; i++) {
-      b_data[i] = float_to_half(new_bias_data[i]);
+    if (channel % alignment != 0 || channel < alignment) {
+      int c_lcm = lcm_(channel, alignment);
+      repeat = c_lcm / (channel);
+    }
+    Shape shape(N, {channel * repeat});
+
+    float16* b_data = bias_.mutableData<float16>(FP16, shape);
+    if (param_.bias()->dataType() == FP32) {
+      float* new_bias_data = param_.bias()->data<float>();
+      // convert the bias from float to float16
+      // for (int i = 0; i < channel; i++) {
+      //   b_data[i] = float_to_half(new_bias_data[i]);
+      // }
+      // pad the bias out to the 16-element alignment by repeating it
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          float16 value = float_to_half(new_bias_data[j]);
+          b_data[i * length + j] = value;
+        }
+      }
+      bias_.flush();
+    } else {
+      float16* new_bias_data = param_.bias()->data<float16>();
+      // memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          // float16 value = float_to_half(bias_data_float[j]);
+          b_data[i * length + j] = new_bias_data[j];
+        }
+      }
+      bias_.flush();
     }
-    bias_.flush();

-    Tensor* quantized_filter = param.quantizedFilter();
-    quantized_filter->mutableData<float16>(FP16, param.filter->shape());
-    format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+    if (param_.scale()->dataType() == FP32) {
+      float* new_scale_data = param_.scale()->data<float>();
+      Tensor* quantized_filter = param.quantizedFilter();
+      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
+      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+
+    } else {
+      // used when the filter is all ones and the channel count is already aligned
+      float16* scale_data = param_.scale()->data<float16>();
+      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
+          FP16, param.filter->shape());
+
+      // memcpy(filter_data, scale_data, channel * sizeof(float16));
+      memcpy(filter_data,
+             scale_data,
+             param.filter->shape().numel() * sizeof(float16));
+      param.quantizedFilter()->flush();
+    }

     DWconvArgs args = {0};
     args.bias_address = b_data;
@@ -71,20 +125,33 @@ class DepthwiseConvPE : public PE {
     args.sub_conv_num = 1;
     param.args = args;

-    inplace_.relu_enable = param_.relu.enabled;
     inplace_.power_enable = false;
     inplace_.normalize_enable = false;
   }

   bool dispatch() {
     param_.input->syncToDevice();
-    if (param_.relu.enabled) {
-      inplace_.relu_enable = param_.relu.enabled;
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if
(param_.activeParam.type == TYPE_SIGMOID) { + inplace_.sigmoid_enable = true; + } else if (param_.activeParam.type == TYPE_LEAKY_RELU) { + inplace_.leaky_relu_enable = true; + } + + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { config_inplace(inplace_); } bool ret = compute_fpga_dwconv(param_.args) == 0; - if (param_.relu.enabled) { + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { inplace_.relu_enable = false; + inplace_.leaky_relu_enable = false; + inplace_.relu6_enable = false; + inplace_.sigmoid_enable = false; config_inplace(inplace_); } return ret; diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp index a498a2bde9a3656cf8b7006b867eec088d87b425..6f76ae3d4a1d9d054339d929515f24989f1c15b0 100755 --- a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp +++ b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp @@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE { bool dispatch() { param_.inputs[0]->syncToDevice(); param_.inputs[1]->syncToDevice(); - InplaceArgs inplace_args = {0}; - if (param_.relu.enabled) { - inplace_args.relu_enable = true; - config_inplace(inplace_args); + // InplaceArgs inplace_ = {0}; + + if (param_.activeParam.type == TYPE_RELU) { + inplace_.relu_enable = true; + } else if (param_.activeParam.type == TYPE_RELU6) { + inplace_.relu6_enable = true; + } else if (param_.activeParam.type == TYPE_SIGMOID) { + inplace_.sigmoid_enable = true; + } else if (param_.activeParam.type == TYPE_LEAKY_RELU) { + inplace_.leaky_relu_enable = true; + } + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { + config_inplace(inplace_); } compute_fpga_ewadd(param_.ewargs); - if (param_.relu.enabled) { - inplace_args.relu_enable = false; - config_inplace(inplace_args); + if (inplace_.relu_enable || inplace_.leaky_relu_enable || + inplace_.relu6_enable || inplace_.sigmoid_enable) { + inplace_.relu_enable = false; + inplace_.relu6_enable = false; + inplace_.sigmoid_enable = false; + inplace_.leaky_relu_enable = false; + config_inplace(inplace_); } return true; } @@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE { private: ElementwiseAddParam param_; + InplaceArgs inplace_ = {0}; }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7730c598b0e8745e47df7d5c456e2b5420fbe6c0 --- /dev/null +++ b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp @@ -0,0 +1,77 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#pragma once
+
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+namespace paddle {
+namespace zynqmp {
+
+class ElementwiseMulPE : public PE {
+ public:
+  bool init() {
+    Tensor* output = param_.output;
+    output->setAligned(true);
+    output->setDataLocation(Device);
+    return true;
+  }
+
+  void apply() {
+    Tensor* input = param_.input_x;
+    Tensor* output = param_.output;
+
+    int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
+
+    Shape s(N, {wc_aligned});
+    float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
+    memset(bias_data, 0, wc_aligned * sizeof(float16));
+
+    ScaleArgs& args = args_;
+    args.scale_address = param_.input_y->data<void>();
+    args.bias_address = bias_tensor.data<void>();
+    args.wc_alignment = wc_aligned;
+    args.channel_alignment = wc_aligned;
+    args.image.address = input->data<void>();
+    args.image.scale_address = input->scale();
+    args.image.channels = wc_aligned;
+    args.image.height = 1;
+    args.image.width = 1;
+    args.image.pad_width = 0;
+    args.image.pad_height = 0;
+    args.output.address = output->data<void>();
+    args.output.scale_address = output->scale();
+  }
+
+  void updateInput(Tensor* t, int index) {
+    if (index == 0) {
+      args_.scale_address = t->data<void>();  // replace inputs?
+    }
+  }
+
+  bool dispatch() {
+    return compute_fpga_scale(args_) == 0;
+  }
+
+  ElementwiseMulParam& param() { return param_; }
+
+ private:
+  ElementwiseMulParam param_;
+  ScaleArgs args_ = {0};
+  Tensor bias_tensor;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
index 2179a142ad3b3a990512b3ea1cd202bc5ce502f1..a2b184e383aa600b1279197a115c58309e204a95 100644
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -38,6 +38,8 @@ class FullyConnectedPE : public PE {
     Tensor* input = param_.input;
     convParam_.input = param_.input;
     convParam_.output = param_.output;
+    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -46,6 +48,9 @@ class FullyConnectedPE : public PE {

     int num = param_.filter->shape().channel();
     int chw = param_.filter->shape().num();
+    // if (num == 2) {
+    //   return;
+    // }

     int height = param_.input->shape().height();
     int width = param_.input->shape().width();
@@ -82,7 +87,45 @@ class FullyConnectedPE : public PE {
     convPE_.apply();
   }

-  bool dispatch() { return convPE_.dispatch(); }
+  void cpu_compute() {
+    int num = param_.filter->shape().channel();
+    int chw = param_.filter->shape().num();
+
+    float* filter_data = param_.filter->data<float>();
+    float max = 0.0f;
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    float16* input_data = input->data<float16>();
+    float16* output_data = output->data<float16>();
+
+    for (int i = 0; i < num; i++) {
+      float sum = 0;
+      float bias = param_.bias->data<float>()[i];
+      for (int j = 0; j < chw; j++) {
+        float scale = filter_data[j * num + i];
+        float data = half_to_float(input_data[j]);
+        sum += scale * data;
+      }
+      float value = sum + bias;
+      output_data[i] = float_to_half(value);
+      if (max < value) {
+        max = value;
+      }
+    }
+
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
+  }
+
+  bool dispatch() {
+    // int num = param_.filter->shape().channel();
+    // if (num == 2) {
+    //   cpu_compute();
+    //   return 1;
+    // } else {
+    return convPE_.dispatch();
+    // }
+  }

   FullyConnectedParam& param()
{ return param_; } diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp new file mode 100755 index 0000000000000000000000000000000000000000..299ffb872b4620fc409eb8e66760a6308a814efb --- /dev/null +++ b/lite/backends/fpga/KD/pes/gru_pe.hpp @@ -0,0 +1,192 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "lite/backends/arm/math/sgemm.h" +#include "lite/backends/fpga/KD/pe.hpp" +#include "lite/backends/fpga/KD/pe_params.hpp" +#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" +#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp" +#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" +#include "lite/backends/fpga/KD/pes/relu_pe.hpp" + +#include "lite/api/paddle_place.h" +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace zynqmp { + +struct GRUTensors { + Tensor* gate; + Tensor* pre_output; + Tensor* output; + Tensor* reset_output; +}; + +class GRUPE : public PE { + public: + bool init() { return true; } + + void apply() { + auto hidden = param_.hidden; + int frame_size = hidden->shape().channel(); + + zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}}; + float16* prev_hidden_data = + prev_hidden_.mutableData(zynqmp::FP16, hidden_shape); + // set previous hidden data to 0; + memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16)); + + // copy 2/3 weight from param.weight; + zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}}; + float* weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); + memset(weight_data, 0, weight_shape.numel() * sizeof(float)); + weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); + memcpy(weight_data, + param_.weight->data(), + weight_shape.numel() * sizeof(float)); + + Shape gate_shape(zynqmp::NC, {1, frame_size * 2}); + gate_ping_.mutableData(FP32, gate_shape); + gate_pong_.mutableData(FP16, gate_shape); + + zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param(); + pre_out_param.input = &prev_hidden_; + pre_out_param.output = &gate_pong_; + pre_out_param.filter = &weight_; + pre_out_param.bias = &gate_ping_; + pre_out_pe_.init(); + pre_out_pe_.apply(); + + reset_gate_.mutableData(FP16, hidden_shape); + prev_hidden_.mutableData(FP16, hidden_shape); + reset_hidden_.mutableData(FP16, hidden_shape); + + ElementwiseMulParam& mul_param = mul_pe_.param(); + // mul_param.inputs = {&reset_gate_, &prev_hidden_}; + mul_param.input_x = &reset_gate_; + mul_param.input_y = &prev_hidden_; + mul_param.output = &reset_hidden_; + mul_pe_.init(); + mul_pe_.apply(); + } + + bool dispatch() { return true; } + + void gru_unit_reset_act(const lite_api::ActivationType active_gate, + GRUTensors& value, // NOLINT + int frame_size, + int batch_size) { + int stride_update = 3 * frame_size; + int stride_cell_state = 3 * frame_size; + int stride_hidden_prev = frame_size; + int stride_hidden = frame_size; + + float* update_gate_data = 
gate_ping_.data(); + float* reset_gate_data = update_gate_data + frame_size; + + for (int b = 0; b < batch_size; b++) { + Tensor tmp; + Shape s(NC, {1, frame_size}); + float* tmp_data = tmp.mutableData(FP32, s); + + for (int i = 0; i < frame_size; i++) { + update_gate_data[i] = + lite::arm::math::active_f32( + update_gate_data[i]); + reset_gate_data[i] = + lite::arm::math::active_f32( + reset_gate_data[i]); + } + memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float)); + tmp.flush(); + reset_gate_.copyFrom(&tmp); + + Tensor* hidden_prev = value.pre_output; + if (hidden_prev) { + // TODO(chonwhite): change to pre_out; + prev_hidden_.copyFrom(value.pre_output); + } + mul_pe_.dispatch(); + update_gate_data += stride_update; + reset_gate_data += stride_update; + + // reset_hidden_prev += stride_hidden;// TODO + } + } + + void gru_unit_out_act(const lite_api::ActivationType active_node, + bool origin_mode, + GRUTensors& value, // NOLINT + int frame_size, + int batch_size) {} + + void copy_input(GRUTensors& value) { // NOLINT + float max = find_max(*(value.gate)); + gate_ping_.mutableData(FP32, value.gate->shape()); + gate_ping_.copyFrom(value.gate); + // update input pointer? + } + + void GRUCOmpute(GRUTensors& value, // NOLINT + int frame_size, + int batch_size, + const lite_api::ActivationType active_node, + const lite_api::ActivationType active_gate, + bool origin_mode) { + copy_input(value); + + if (value.pre_output) { + // copy by batch; + pre_out_pe_.dispatch(); + gate_ping_.copyFrom(&gate_pong_); + } + + gru_unit_reset_act(active_gate, value, frame_size, batch_size); + } + + GRUParam& param() { return param_; } + + Tensor* updateGate() { return &update_gate_; } + + Tensor* resetGate() { return &reset_gate_; } + + private: + GRUParam param_; + zynqmp::Tensor gate_ping_; + zynqmp::Tensor gate_pong_; + zynqmp::Tensor bias_; + zynqmp::Tensor weight_; + zynqmp::Tensor state_weight_; + zynqmp::Tensor update_gate_; + zynqmp::Tensor reset_gate_; + zynqmp::Tensor cell_state_; + zynqmp::Tensor prev_hidden_; + zynqmp::Tensor reset_hidden_; + + Tensor tempTensor; + + ReluPE update_relu_pe_; + ReluPE reset_relu_pe_; + zynqmp::ElementwiseMulPE mul_pe_; + zynqmp::FullyConnectedPE pre_out_pe_; + zynqmp::FullyConnectedPE reset_out_pe_; + + zynqmp::ElementwiseAddPE bias_ew_pe_; +}; + +} // namespace zynqmp +} // namespace paddle diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/backends/fpga/KD/pes/gru_util.hpp similarity index 71% rename from lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h rename to lite/backends/fpga/KD/pes/gru_util.hpp index 3c76e0e8b5cf0842cb8d5a613cef7aee3cd13bdb..d49169846f4f18e4d8e30f3658c2173157678f81 100644 --- a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h +++ b/lite/backends/fpga/KD/pes/gru_util.hpp @@ -14,13 +14,10 @@ #pragma once -#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/backends/arm/math/gru_utils.h" -USE_XPU_BRIDGE(relu); -USE_XPU_BRIDGE(conv2d); -USE_XPU_BRIDGE(depthwise_conv2d); -USE_XPU_BRIDGE(elementwise_add); -USE_XPU_BRIDGE(pool2d); -USE_XPU_BRIDGE(softmax); -USE_XPU_BRIDGE(mul); -USE_XPU_BRIDGE(batch_norm); +namespace paddle { +namespace lite { +namespace fpga {} +} +} diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp index 3e2fd8062766c84282233b91fcaecf5e0a26fd72..0537df27e212014ed309245b0e86b8d8f077489e 100644 --- a/lite/backends/fpga/KD/pes/norm_pe.hpp +++ b/lite/backends/fpga/KD/pes/norm_pe.hpp @@ -72,8 +72,10 @@ class NormPE : public PE { input_float.mutableData(FP32, 
param_.input->shape()); float_out.mutableData(FP32, param_.output->shape()); + // param_.input->syncToDevice(); input_float.copyFrom(param_.input); input_float.syncToCPU(); + // input_float.saveToFile("normalize_", true); int channel = input_float.shape().channel(); int height = input_float.shape().height(); @@ -85,6 +87,7 @@ class NormPE : public PE { float* out_ptr = float_out.data(); int loop = height * width; +#pragma omp parallel for for (int i = 0; i < loop; i++) { float sum = param_.epsilon; for (int c = 0; c < channel; c++) { @@ -98,11 +101,26 @@ class NormPE : public PE { } } float_out.flush(); + // float_out.saveToFile("normalize_", true); param_.output->copyFrom(&float_out); } bool dispatch() { cpuCompute(); + // std::cout << "CPU normalize ---------------------" << std::endl; + + // param_.input->syncToDevice(); + // // param_.input->saveToFile("normalize_fpga_", true); + // config_norm_param(norm_param_args_); + // inplace_args_.normalize_enable = true; + // config_inplace(inplace_args_); + + // perform_bypass(bypass_args_); + // inplace_args_.normalize_enable = false; + // config_inplace(inplace_args_); + // compute_norm(norm_args_); + // param_.output->saveToFile("normalize_fpga_", true); + // std::cout << "FPGA normalize ---------------------" << std::endl; return true; } diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp old mode 100644 new mode 100755 index 1c99386ab19f485c07723c7fcc8501bdf5556f6c..2944691693b135a2d2df7b91ecbe0ef249b015d8 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ b/lite/backends/fpga/KD/pes/output_pe.hpp @@ -25,6 +25,8 @@ class OutputPE : public PE { bool init() { Tensor* output = param_.output; output->setAligned(false); + DLEngine::get_instance().out_data = reinterpret_cast( + fpga_malloc(output->shape().numel() * sizeof(float))); return true; } @@ -41,6 +43,15 @@ class OutputPE : public PE { } else { output->copyFrom(input); } + // + output->syncToCPU(); + if (DLEngine::get_instance().out_data == nullptr) { + DLEngine::get_instance().out_data = reinterpret_cast( + fpga_malloc(output->shape().numel() * sizeof(float))); + } + memcpy(DLEngine::get_instance().out_data, + output->data(), + output->shape().numel() * sizeof(float)); return true; } diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index fd3be1f463d3bfce925cc4ce5444d119c33e5692..60755ee1dbf81512bde618389cbf3a88cf93d1ce 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -35,12 +35,17 @@ class PoolingPE : public PE { Tensor* input = param_.input; Tensor* output = param_.output; - uint32_t k_width = param_.kernelSize[0]; - uint32_t k_height = param_.kernelSize[1]; + uint32_t k_height = 1; + uint32_t k_width = 1; if (param_.globalPooling) { k_width = input->shape().width(); k_height = input->shape().height(); + param_.kernelSize[0] = k_height; + param_.kernelSize[1] = k_width; + } else { + k_height = param_.kernelSize[0]; + k_width = param_.kernelSize[1]; } PoolingArgs args = {0}; @@ -63,8 +68,12 @@ class PoolingPE : public PE { args.out_width = output->shape().width(); param_.poolingArgs = args; + // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 + // && + // (k_width > 7 || k_height > 7); use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && - (k_width > 7 || k_height > 7); + (k_width > 255 || k_height > 255); + // use_cpu_ = param_.type == AVERAGE; } void compute() { @@ -73,6 +82,7 @@ class 
PoolingPE : public PE { input->syncToCPU(); Tensor float_input; + // Tensor float_output; float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); @@ -107,6 +117,8 @@ class PoolingPE : public PE { for (int c = 0; c < image_channels; ++c) { const int pool_index = (ph * pooled_width_ + pw) * image_channels + c; float sum = 0; + // const int index = + // (hstart * image_width + wstart) * image_channels + c; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = (h * image_width + w) * image_channels + c; @@ -127,7 +139,7 @@ class PoolingPE : public PE { output->flush(); } - void cpu_compute() { + void cpu_compute1() { Tensor* input = param_.input; Tensor* output = param_.output; input->syncToCPU(); @@ -135,6 +147,7 @@ class PoolingPE : public PE { Tensor float_input; float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); + // float_input.saveToFile("pool_float.txt"); float16* data_out = output->data(); int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; @@ -152,13 +165,45 @@ class PoolingPE : public PE { } output->scale()[0] = scale_max / 127.0f; output->scale()[1] = 127.0f / scale_max; - std::cout << "pool scale:" << scale_max / 127.0f << std::endl; output->flush(); + // exit(-1); + } + + void cpu_compute() { + Tensor* input = param_.input; + Tensor* output = param_.output; + input->syncToCPU(); + + Tensor float_input; + float* float_input_data = + float_input.mutableData(FP32, input->shape()); + float_input.copyFrom(input); + + float16* data_out = output->data(); + + int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; + + float scale_max = 0; + for (int i = 0; i < output->shape().channel(); i++) { + float sum = 0; + for (int j = 0; j < kernel_hw; j++) { + sum += float_input_data[i * kernel_hw + j]; + } + float value = sum / kernel_hw; + data_out[i] = float_to_half(value); + scale_max = std::max(scale_max, std::abs(value)); + } + output->scale()[0] = scale_max / 127.0f; + output->scale()[1] = 127.0f / scale_max; + output->flush(); + // exit(-1); } bool dispatch() { if (use_cpu_) { + // cpu_compute(); compute(); + // exit(-1); return true; } param_.input->syncToDevice(); diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp index d6a503a31d4e0736724740ce1875c916969d93e0..00dfe1830f6f44cbf6a30708fa5783563470c686 100644 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp @@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() { if (cachedBoxes_ == nullptr) { cachedBoxes_ = new Tensor(); cachedVariances_ = new Tensor(); - cachedBoxes_->mutableData(FP16, param_.outputBoxes->shape()); - cachedVariances_->mutableData(FP16, - param_.outputVariances->shape()); + cachedBoxes_->mutableData(FP32, param_.outputBoxes->shape()); + cachedVariances_->mutableData(FP32, param_.outputVariances->shape()); cachedBoxes_->setDataLocation(CPU); cachedVariances_->setDataLocation(CPU); compute_prior_box(); diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index d5e16615d9943a1771dfabe916433768ecf16319..09755c65a322da8ccab0d57dd2e877712b112361 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -14,11 +14,16 @@ limitations under the License. 
*/
 #pragma once
 
+#include
+
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
+#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
+#include "lite/backends/fpga/KD/tensor.hpp"
 
 namespace paddle {
 namespace zynqmp {
+
 class ScalePE : public PE {
  public:
   inline int gcd(int a, int b) {
@@ -42,6 +47,8 @@ class ScalePE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     Shape& input_shape = input->shape();
+    DepthwiseConvParam& dw_param = dw_pe_.param();
+
     int channel = input_shape.channel();
     int repeat = 1;
     int alignment = 16;
@@ -51,70 +58,141 @@ class ScalePE : public PE {
       int c_lcm = lcm(channel, alignment);
       repeat = c_lcm / (channel);
     }
+
+    // FPGA limits: H > 2047, W > 1023, or WC > 65536 require the CPU implementation
     Shape shape(N, {channel * repeat});
-    param_.alignedBias()->mutableData<float16>(FP16, shape);
-    param_.alignedScale()->mutableData<float16>(FP16, shape);
-    float16* bias_data = param_.alignedBias()->data<float16>();
-    float16* scale_data = param_.alignedScale()->data<float16>();
+    float* filter_data = filter.mutableData<float>(FP32, shape);
+    std::fill_n(filter_data, input->shape().channel(), 1.0f);
 
-    if (param_.bias != nullptr) {
-      float* bias_data_float = param_.bias->data<float>();
+    Tensor* scale = dw_param.scale();
+    float16* scale_data = scale->mutableData<float16>(FP16, shape);
+
+    Tensor* bias = dw_param.bias();
+    float16* bias_data = bias->mutableData<float16>(FP16, shape);
+    std::fill_n(bias_data, input->shape().channel(), 0);
+
+    if (param_.scale->dataType() == FP32) {
+      if (param_.bias != nullptr) {
+        float* bias_data_float = param_.bias->data<float>();
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            float16 value = float_to_half(bias_data_float[j]);
+            bias_data[i * length + j] = value;
+          }
+        }
+      } else {
+        float16 zero = float_to_half(0.0f);
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            bias_data[i * length + j] = zero;
+          }
+        }
+      }
+      float* scale_data_float = param_.scale->data<float>();
       for (int i = 0; i < repeat; i++) {
         for (int j = 0; j < length; j++) {
-          float16 value = float_to_half(bias_data_float[j]);
-          bias_data[i * length + j] = value;
+          float16 value = float_to_half(scale_data_float[j]);
+          scale_data[i * length + j] = value;
         }
       }
     } else {
-      float16 zero = float_to_half(0.0f);
+      if (param_.bias != nullptr) {
+        float16* bias_data_float = param_.bias->data<float16>();
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            float16 value = bias_data_float[j];
+            bias_data[i * length + j] = value;
+          }
+        }
+      } else {
+        float16 zero = float_to_half(0.0f);
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            bias_data[i * length + j] = zero;
+          }
+        }
+      }
+
+      float16* scale_data_float = param_.scale->data<float16>();
       for (int i = 0; i < repeat; i++) {
         for (int j = 0; j < length; j++) {
-          bias_data[i * length + j] = zero;
+          float16 value = scale_data_float[j];
+          scale_data[i * length + j] = value;
         }
       }
     }
 
-    float* scale_data_float = param_.scale->data<float>();
-    for (int i = 0; i < repeat; i++) {
-      for (int j = 0; j < length; j++) {
-        float16 value = float_to_half(scale_data_float[j]);
-        scale_data[i * length + j] = value;
+    dw_param.input = param_.input;
+    dw_param.output = param_.output;
+    dw_param.filter = &filter;
+
+    dw_param.strides = {1, 1};
+    dw_param.paddings = {0, 0};
+    dw_param.kernelSize = {1, 1};
+    dw_param.dilations = {1, 1};
+
+    dw_pe_.init();
+    dw_pe_.apply();
+  }
+
+  void cpu_compute() {
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    Tensor float_input;
+    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
+    input->syncToCPU();
+    float_input.copyFrom(input);
+    float16* data_out = output->data<float16>();
+
+    float* scale_data = param_.scale->data<float>();
+
+    int wh = input->shape().width() * input->shape().height();
+
+    float16* in_data = input->data<float16>();
+
+    float max = 0;
+
+    for (int i = 0; i < wh; i++) {
+      for (int c = 0; c < input->shape().channel(); c++) {
+        int index = i * input->shape().channel() + c;
+        float value = half_to_float(in_data[index]) * scale_data[c];
+        data_out[index] = float_to_half(value);
+
+        if (value < 0) {
+          value = -value;
+        }
+        if (value > max) {
+          max = value;
+        }
       }
     }
-
-    param_.alignedScale()->flush();
-    param_.alignedBias()->flush();
-
-    int wc = input_shape.width() * input_shape.channel();
-    int wc_aligned = align_image(wc);
-
-    ScaleArgs& args = param_.args;
-    args.scale_address = param_.alignedScale()->data();
-    args.bias_address = param_.alignedBias()->data();
-    args.wc_alignment = wc_aligned;
-    args.channel_alignment = channel * repeat;
-
-    args.image.address = input->data();
-    args.image.scale_address = input->scale();
-    args.image.channels = channel;
-    args.image.height = input_shape.height();
-    args.image.width = input_shape.width();
-    args.image.pad_width = 0;
-    args.image.pad_height = 0;
-    args.output.address = output->data();
-    args.output.scale_address = output->scale();
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
   }
 
   bool dispatch() {
+    if (param_.scale->dataType() == FP16) {
+      DepthwiseConvParam& dw_param = dw_pe_.param();
+      memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
+             param_.scale->data<float16>(),
+             param_.scale->shape().numel() * sizeof(float16));
+      dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
+      dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
+
+      dw_param.quantizedFilter()->flush();
+    }
     param_.input->syncToDevice();
-    return compute_fpga_scale(param_.args) == 0;
+    return dw_pe_.dispatch();
  }
 
   ScaleParam& param() { return param_; }
 
  private:
   ScaleParam param_;
+  Tensor filter;
+  DepthwiseConvPE dw_pe_;
 };
 } // namespace zynqmp
 } // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp
index 26598a4c87f0b88882b3fe76de64ddfa5c6cd6a8..01a036787441c596bf74858aa9bf6a6613864cc1 100644
--- a/lite/backends/fpga/KD/pes/split_pe.hpp
+++ b/lite/backends/fpga/KD/pes/split_pe.hpp
@@ -53,20 +53,37 @@ class SplitPE : public PE {
     int64_t src_after = src_stride_numel[axis];
     int64_t dst_after = dst_stride_numel[axis];
 
+    // PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
+    //                       "src and dst tensor should have the same dims
+    //                       size.");
+
     for (int64_t i = 0; i < axis; ++i) {
       if (i < axis) {
+        // PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
+        //                       dst_stride_numel[i] /
+        //                       dst_stride_numel[axis],
+        //                       "src and dst should have the same elements "
+        //                       "except the specified axis.");
       } else if (i == axis) {
         continue;
       } else {
+        // PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
+        //                       "src and dst should have the same elements "
+        //                       "except the specified axis.");
      }
    }
 
     for (int64_t i = 0; i < before; ++i) {
-      memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+      memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
     }
   }
 
-  void split3D() { int axis = param_.axis; }
+  void split3D() {
+    int axis = param_.axis;
+    // float16* dst = param_.output->data<float16>();
+    // std::vector& dst_dims = ;
+    //
StridedNumelCopyWithAxis(); + } bool dispatch() { Tensor* input = param_.input; @@ -88,6 +105,7 @@ class SplitPE : public PE { in_stride, out_stride[axis]); input_offset += out_stride[axis]; + // out->flush(); } return true; } @@ -95,21 +113,26 @@ class SplitPE : public PE { std::vector outputs = param_.outputs; int in_channel = input->shape().channel(); - int split_channel = input->shape().channel() / param_.num; + // int split_channel = input->shape().channel() / param_.num; int hw = input->shape().height() * input->shape().width(); float16* in_data = input->data(); + for (int i = 0; i < hw; i++) { + int channel_stride = 0; for (int n = 0; n < outputs.size(); n++) { Tensor* out = outputs[n]; float16* out_data = out->data(); - memcpy(out_data + i * split_channel, - in_data + i * in_channel + n * split_channel, - split_channel * sizeof(float16)); + memcpy(out_data + i * out->shape().channel(), + in_data + i * in_channel + channel_stride, + out->shape().channel() * sizeof(float16)); + channel_stride += out->shape().channel(); } } + for (int n = 0; n < outputs.size(); n++) { Tensor* out = outputs[n]; + out->flush(); out->copyScaleFrom(input); } return true; @@ -120,5 +143,6 @@ class SplitPE : public PE { private: SplitParam param_; }; + } // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/shape.hpp b/lite/backends/fpga/KD/shape.hpp index 566ad8e6ff2eff32301e83b6cdb5b1addd0117fe..c25c3315145137a147928a164fcabd2923b09e87 100755 --- a/lite/backends/fpga/KD/shape.hpp +++ b/lite/backends/fpga/KD/shape.hpp @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle { namespace zynqmp { +static struct None none_; static struct NCHW nchw_; static struct NHWC nhwc_; static struct NC nc_; @@ -82,6 +83,9 @@ class Shape { void setLayoutType(LayoutType layout) { this->layoutType_ = layout; switch (layout) { + case None: + layout_ = &none_; + break; case NCHW: layout_ = &nchw_; break; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index f003ded33eb51136ae0ae0a2c21988460232f89a..988bc1bb507036de8f13a6c6549c549718bd1256 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -24,13 +25,10 @@ limitations under the License. 
*/ #include #include -// #include "lite/core/tensor.h" - #include "lite/backends/fpga/KD/dl_engine.hpp" #include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" #include "lite/backends/fpga/KD/shape.hpp" -// #include "lite/backends/fpga/KD/types.hpp" namespace paddle { namespace zynqmp { @@ -117,7 +115,8 @@ class Tensor { template Dtype* mutableData() { - size_t memorySize = shape_->memorySize(CellSize(dataType_)); + size_t memorySize = + shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; if (placeHolder_ != nullptr) { if (memorySize > placeHolder_->memorySize()) { placeHolder_.reset(new PlaceHolder(memorySize)); @@ -241,6 +240,10 @@ class Tensor { } } + void setMemScale(float scale_factor) { + this->mem_scale_factor_ = scale_factor; + } + void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); } void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) { @@ -276,9 +279,11 @@ class Tensor { .height = 1, .pad_width = 0u, .pad_height = 0u}; - args.output = { + + ImageOutputArgs output = { .address = data(), .scale_address = scale(), }; + args.output = output; src->syncToDevice(); size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { @@ -294,10 +299,14 @@ class Tensor { this->invalidate(); } - void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } + void flush() { + size_t memorySize = placeHolder_->memorySize(); + fpga_flush(placeHolder_->data(), memorySize); + } void invalidate() { - fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); + size_t memorySize = placeHolder_->memorySize(); + fpga_invalidate(placeHolder_->data(), memorySize); } void sync() { @@ -339,6 +348,8 @@ class Tensor { } } + void printScale(std::string type) { printScale(); } + std::string dimsFileName() { return std::to_string(shape_->num()) + "_" + std::to_string(shape_->channel()) + "_" + @@ -358,29 +369,9 @@ class Tensor { saveToFile(path); } - friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { - os << "tensor:" - << "\n"; - os << "dims: {"; - for (int i = 0; i < tensor.shape().dimSize(); ++i) { - os << tensor.shape()[i] << " "; - } - os << "}\n"; - for (int i = 0; i < tensor.shape().numel(); i++) { - float value = 0; - if (tensor.dataType() == FP32) { - value = tensor.data()[i]; - } else { - value = half_to_float(tensor.data()[i]); - } - os << value << " "; - } - os << "\n"; - return os; - } - void saveToFile(std::string path) { syncToCPU(); + invalidate(); std::ofstream ofs; static int counter = 0; std::string npath = std::to_string(counter) + "_" + path; @@ -389,17 +380,19 @@ class Tensor { } void save_file_with_name(std::string path) { - // return; invalidate(); std::ofstream ofs; - ofs.open(path); + ofs << scale()[0] << " / " << scale()[1] << std::endl; + for (int i = 0; i < shape_->numel(); i++) { float value = 0; if (dataType_ == FP32) { value = data()[i]; - } else { + } else if (dataType_ == FP16) { value = half_to_float(data()[i]); + } else { + value = data()[i]; } ofs << value << std::endl; } @@ -415,18 +408,49 @@ class Tensor { int num = shape_->numel(); invalidate(); float max = 0.0f; - float16* data = mutableData(); - for (int i = 0; i < num; ++i) { - float value = 0; - file_stream >> value; - max = std::max(std::abs(value), max); - data[i] = float_to_half(value); + if (dataType_ == FP16) { + float16* data = mutableData(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + max = std::max(std::abs(value), max); + data[i] = 
float_to_half(value); + } + } else { + float* data = mutableData(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + max = std::max(std::abs(value), max); + data[i] = value; + } } flush(); placeHolder_->scale_[0] = max / 127.0f; placeHolder_->scale_[1] = 127.0f / max; } + friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { + os << "tensor:" + << "\n"; + os << "dims: {"; + for (int i = 0; i < tensor.shape().dimSize(); ++i) { + os << tensor.shape()[i] << " "; + } + os << "}\n"; + for (int i = 0; i < tensor.shape().numel(); i++) { + float value = 0; + if (tensor.dataType() == FP32) { + value = tensor.data()[i]; + } else { + value = half_to_float(tensor.data()[i]); + } + os << value << " "; + } + os << "\n"; + return os; + } + ~Tensor() { if (shape_ != nullptr) { delete shape_; @@ -436,6 +460,7 @@ class Tensor { private: int offset = 0; + float mem_scale_factor_ = 1.0f; std::shared_ptr placeHolder_; Shape* shape_ = nullptr; DataType dataType_ = FP32; diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc old mode 100644 new mode 100755 index 43218173fd05626fb46495bb254b250c14e5417a..7f1e8d3e17f97315e77532b77bbcfcc8331edd4f --- a/lite/backends/fpga/lite_tensor.cc +++ b/lite/backends/fpga/lite_tensor.cc @@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { dims_ = other.dims_; target_ = other.target_; lod_ = other.lod_; - // memory_size_ = other.memory_size_; - // buffer_->CopyDataFrom(*other.buffer_, memory_size_); - zynq_tensor_->mutableData(other.zynq_tensor_->dataType(), - other.zynq_tensor_->shape()); -} + auto dt = zynq_tensor_->dataType(); -// template -// void TensorLite::mutable_data_internal() { + auto shape = other.zynq_tensor_->shape(); -// } + Resize(other.dims()); + zynq_tensor_->mutableData(zynq_tensor_->dataType(), shape); + this->ZynqTensor()->copyFrom(other.ZynqTensor()); +} } // namespace lite } // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index 2f9df3abb08dd15641323f4a3c59d6175f2e481b..266e0b5ce0ea03108978c3b0a32fbf0e3872c83c 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -106,7 +106,7 @@ class TensorLite { // For other devices, T and R may be the same type. template const R *data() const { - return zynq_tensor_->data(); + return zynq_tensor_->data() + offset_; } void Resize(const DDimLite &ddim) { dims_ = ddim; } @@ -125,6 +125,7 @@ class TensorLite { bool persistable() const { return persistable_; } void set_persistable(bool persistable) { persistable_ = persistable; } + // T is the data type and R is the return type // For OpenCL, the return type can be cl::Buffer // and the data type can be float/int8_t. @@ -147,7 +148,13 @@ class TensorLite { size_t memory_size() const { return zynq_tensor_->memorySize(); } + size_t offset() const { return offset_; } + bool IsInitialized() const { return buffer_->data(); } + void clear() { + buffer_->Free(); + offset_ = 0; + } // Other share data to this. void ShareDataWith(const TensorLite &other); @@ -157,6 +164,9 @@ class TensorLite { template TensorLite Slice(int64_t begin, int64_t end) const; + template + void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT + TargetType target() const { return target_; } zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; } @@ -173,16 +183,21 @@ class TensorLite { private: TargetType target_{TargetType::kHost}; + + // precision_ and persistable_ are only used for persistable vars. 
+ // If your tensor wants to be saved and loaded correctly, you must + // set values of precision_ and persistable_ after updating it. + // If your tensor is just a temp tensor, such as activations, + // you can ignore these two attributes. + PrecisionType precision_{PrecisionType::kUnk}; + bool persistable_{false}; + DDimLite dims_; std::shared_ptr buffer_; LoD lod_; size_t memory_size_{}; - size_t offset_{0}; - PrecisionType precision_{PrecisionType::kUnk}; - bool persistable_{false}; - zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor(); template @@ -197,6 +212,9 @@ R *TensorLite::mutable_data() { } zynqmp::LayoutType layout_type = zynqmp::NCHW; switch (v.size()) { + case 0: + layout_type = zynqmp::None; + break; case 1: layout_type = zynqmp::N; break; @@ -228,24 +246,60 @@ R *TensorLite::mutable_data(TargetType target) { return mutable_data(); } -template -bool TensorCompareWith(const TensorT &a, const TensorT &b) { - if (a.dims() != b.dims()) return false; - if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; - return true; -} template TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { - int64_t base = numel() / dims_[0]; + throw - 1; + CHECK_GE(begin, 0); + CHECK_LE(end, dims_[0]); + CHECK_LT(begin, end); + if (dims_[0] == 1) { + return *this; + } else { + int64_t base = numel() / dims_[0]; + + TensorLite dst; + dst.target_ = target_; + auto dst_dims = dims_; + dst_dims[0] = end - begin; + dst.Resize(dst_dims); + void *dst_data = dst.mutable_data(); + + T *src_data = const_cast(data()); + memcpy(dst_data, + src_data + static_cast(begin * base) * sizeof(T), + dst_dims.production() * sizeof(T)); + dst.ZynqTensor()->saveToFile("_slice", true); + + return dst; + } +} + +template +void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const { + CHECK_GE(begin, 0); + CHECK_LE(end, dims_[0]); + CHECK_LT(begin, end); - TensorLite dst; - dst.buffer_ = buffer_; dst.target_ = target_; auto dst_dims = dims_; dst_dims[0] = end - begin; dst.Resize(dst_dims); - dst.offset_ = offset_ + static_cast(begin * base) * sizeof(T); - return dst; + void *dst_data = dst.mutable_data(); + + int64_t base = numel() / dims_[0]; + + T *src_data = const_cast(data()); + memcpy(dst_data, + src_data + static_cast(begin * dst_dims.production()), + dst_dims.production() * sizeof(T)); } + +template +bool TensorCompareWith(const TensorT &a, const TensorT &b) { + if (a.dims() != b.dims()) return false; + if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt index 426ff5698146c773c818b2bfd598d6bbbdf7867f..1540741d331097961dcf7cd791c9785a9c53ddd1 100644 --- a/lite/backends/npu/CMakeLists.txt +++ b/lite/backends/npu/CMakeLists.txt @@ -2,5 +2,4 @@ if(NOT LITE_WITH_NPU) return() endif() -lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs}) -lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope) +lite_cc_library(device_npu SRCS device.cc DEPS ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h deleted file mode 100644 index 70200354fbab15f043a537300e92e2a26a3d739e..0000000000000000000000000000000000000000 --- a/lite/backends/npu/builder.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -// Extended Ops of HIAI DDK -namespace ge { -/** - * Pads a tensor. - * - * x : the input tensor - * padding : the input tensor must be 2-D - * constant_values : constant values must be a scalar - * - * output : the output tensor - * - * t_paddings : Default DT_INT32 , t_paddings must be the same with - * datatype of the padding - * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC - * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 - */ -REG_OP(Pad) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(padding, TensorType({DT_INT32})) - .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(t_paddings, AttrValue::INT{3}) - .ATTR(mode, AttrValue::INT{0}) - .REQUIRED_ATTR(T, AttrValue::INT) - .OP_END(); - -} // namespace ge - -namespace paddle { -namespace lite { -namespace npu { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType CvtPrecisionType(PrecisionType itype); - -ge::Format CvtDataLayoutType(DataLayoutType itype); - -ge::TensorPtr CvtTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - 
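Note: the helper deleted directly above built a constant HiAI tensor from host data, mapping the element type T to ge::DT_FLOAT / DT_INT8 / DT_INT32 and checking that data.size() matches the shape volume. A minimal usage sketch of the old API (illustrative only; the per-channel values and the 1x3x1x1 shape are hypothetical):

    std::vector<float> scale_values = {0.5f, 0.25f, 0.125f};  // one value per channel
    std::vector<int64_t> scale_shape = {1, 3, 1, 1};          // NCHW layout
    ge::TensorPtr scale_tensor =
        CreateTensorAndFillData(scale_values, scale_shape, ge::FORMAT_NCHW);

The scalar overload deleted next simply broadcasts a single value over the requested shape and forwards to this vector version.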
-template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -int CvtActMode(std::string act_type); - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1 --- /dev/null +++ b/lite/backends/npu/device.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace npu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes // NOLINT + ) { + VLOG(3) << "[NPU] Build model"; + // Build the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_model_buf; + if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return nullptr; + } + if (!ir_build.BuildIRModel(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + // Create a HiAI model manager client to load the HiAI om model + std::unique_ptr model_client( + new hiai::AiModelMngerClient()); + if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { + LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + model_name = "model_" + std::to_string(model_count_++) + ".om"; + auto model_desc = std::make_shared( + model_name, freq_level(), framework_type(), model_type(), device_type()); + model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); + std::vector> model_descs; + model_descs.push_back(model_desc); + if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { + LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + ir_build.ReleaseModelBuff(om_model_buf); + VLOG(3) << "[NPU] Build done"; + return model_client; +} + +} // namespace npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/runtime.h b/lite/backends/npu/device.h similarity index 63% rename from lite/backends/npu/runtime.h rename to lite/backends/npu/device.h index 8b1ad51518d8626d9a6ecd6203a70b2637bb6004..411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a 100644 --- a/lite/backends/npu/runtime.h +++ 
b/lite/backends/npu/device.h @@ -13,38 +13,47 @@ // limitations under the License. #pragma once + #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/tensor.h" +#include +#include +#include "HiAiModelManagerService.h" // NOLINT +#include "hiai_ir_build.h" // NOLINT namespace paddle { namespace lite { namespace npu { -class DeviceInfo { +class Device { public: - static DeviceInfo &Global() { - static DeviceInfo x; + static Device& Global() { + static Device x; return x; } - DeviceInfo() {} + Device() {} int freq_level() { return freq_level_; } int framework_type() { return framework_type_; } int model_type() { return model_type_; } int device_type() { return device_type_; } + // Build the HiAI IR graph to om model, return HiAI model manager client to + // load om model and run inference. + std::unique_ptr Build( + std::string& model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes // NOLINT + ); // NOLINT + private: int freq_level_{3}; int framework_type_{0}; int model_type_{0}; int device_type_{0}; + int model_count_{0}; }; -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name); } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/runtime.cc b/lite/backends/npu/runtime.cc deleted file mode 100644 index 3485f63c7c8bb91081fd1969d0d41733417149d9..0000000000000000000000000000000000000000 --- a/lite/backends/npu/runtime.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
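The runtime.cc removed in the remainder of this hunk provided LoadModel, which could only load an om model that had been serialized into a lite::Tensor; it is superseded by npu::Device::Build in device.cc above, which compiles the HiAI IR graph and loads the resulting om model in a single call. A rough caller-side sketch, assuming the subgraph bridges have already produced the input/output ge::Operator nodes (variable names here are illustrative):

    std::string model_name;                  // set by Build(), e.g. "model_0.om"
    std::vector<ge::Operator> input_nodes;   // graph inputs from the bridges
    std::vector<ge::Operator> output_nodes;  // graph outputs from the bridges
    auto model_client =
        lite::npu::Device::Global().Build(model_name, input_nodes, output_nodes);
    if (model_client == nullptr) {
      LOG(WARNING) << "[NPU] building/loading the om model failed";
    }

On success, the returned hiai::AiModelMngerClient already holds the model registered under model_name, so callers no longer manage the om buffer themselves.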
- -#include "lite/backends/npu/runtime.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Create hiai model manager to load om model from lite tensor, and return the -// manager and an unique model name -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name) { - LOG(INFO) << "[NPU] Load model."; - auto model_data_ptr = model_data.data(); - auto model_data_size = model_data.numel() * sizeof(int8_t); - if (model_data_ptr == nullptr || model_data_size == 0) { - return false; - } - *model_client = std::make_shared(); - int ret = (*model_client)->Init(nullptr); - if (ret != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!"; - return false; - } - *model_name = "model.om"; - auto model_desc = std::make_shared( - *model_name, - DeviceInfo::Global().freq_level(), - DeviceInfo::Global().framework_type(), - DeviceInfo::Global().model_type(), - DeviceInfo::Global().device_type()); - model_desc->SetModelBuffer(model_data_ptr, model_data_size); - std::vector> model_descs; - model_descs.push_back(model_desc); - if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - return false; - } - return true; -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index 1acb98321844191832fd55b640a9b56d3d51b400..dd7f6b417e0d6416eec9bb3e60ef088432776112 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -11,8 +11,8 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl) + ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl) + ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc index 4926a53c43d54b4e2b4d802a7d8ef289c7e87fc5..6b9cab1056beaa6f516a0d3a202a7816c911f1b2 100644 --- a/lite/backends/opencl/cl_caller.cc +++ b/lite/backends/opencl/cl_caller.cc @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle { namespace lite { + static void CopyImageData(CLContext* context, const CLImage& cl_image, float* out) { @@ -51,119 +52,5 @@ bool InitOpenCLRuntime(std::string cl_path) { return runtime->IsInitSuccess(); } -void elementwise_add(CLContext* context, - const float* in, - const DDim& in_dim, - const float* bias, - const DDim& bias_dim, - float* out, - const DDim& out_dim) { - if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) { - LOG(FATAL) << "Error: bias dims is error"; - return; - } - auto kernel = bias_dim.size() == 1 ? 
context->GetKernel("channel_add") - : context->GetKernel("elementwise_add"); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage bias_image; - bias_image.set_tensor_data(bias, bias_dim); - bias_image.InitCLImage(context->GetContext()); - VLOG(3) << " --- Bias image: " << bias_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - cl_int status; - status = kernel.setArg(0, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *bias_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - if (bias_dim.size() == 1) { - int tensor_w = in_dim[3]; - status = kernel.setArg(3, tensor_w); - CL_CHECK_FATAL(status); - } - size_t width = in_image.ImageWidth(); - size_t height = in_image.ImageHeight(); - auto global_work_size = cl::NDRange{width, height}; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, out); -} - -void pool(CLContext* context, - const std::string pooling_type, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int ksize_h, - const int ksize_w, - const float* in, - const DDim& in_dim, - float* out, - const DDim& out_dim) { - auto kernel = - context->GetKernel(string_format("pool_%s", pooling_type.c_str())); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - auto global_work_size = context->DefaultWorkSize(out_image); - auto* in_converter = - dynamic_cast(in_image.image_converter()); - auto* out_converter = - dynamic_cast(out_image.image_converter()); - const int in_height = in_converter->HeightOfOneBlock(); - const int in_width = in_converter->WidthOfOneBlock(); - const int out_height = out_converter->HeightOfOneBlock(); - const int out_width = out_converter->WidthOfOneBlock(); - cl_int status; - status = kernel.setArg(0, in_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, in_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, out_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, out_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(4, pad_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, pad_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(6, stride_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(7, stride_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(8, ksize_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(9, ksize_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(10, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(11, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, 
out); -} - } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h index ed5c9153d3cedf140cbf0570b7f71393fb918bf9..1817db9f6bd6d9ecf21978b8293bd9534328de0f 100644 --- a/lite/backends/opencl/cl_caller.h +++ b/lite/backends/opencl/cl_caller.h @@ -23,30 +23,5 @@ namespace lite { bool InitOpenCLRuntime(std::string cl_path); -/// An elementwise_add method to embed OpenCL logic inside, it is used as a -/// black box so that the framework can remain simple. -/// NOTE Currently, these methods are quite expensive, we will optimize them -/// latter. -void elementwise_add(CLContext* context, - const float* in, - const DDim& in_dim, - const float* bias, - const DDim& bias_dim, - float* out, - const DDim& out_dim); - -void pool(CLContext* context, - const std::string pooling_type, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int ksize_h, - const int ksize_w, - const float* in, - const DDim& in_dim, - float* out, - const DDim& out_dim); - } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index b9f6648c9956e1952b65f66abfa40d912a99ee67..70f47b47946641edf4d023437b48d46cae93ca6e 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -41,9 +41,10 @@ TEST(cl_test, runtime_test) { auto &context = runtime->context(); auto program = runtime->CreateProgram( context, - runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl"); + runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl"); auto event = runtime->CreateEvent(context); - CHECK(runtime->BuildProgram(program.get())); + const std::string build_option("-DCL_DTYPE_float"); + CHECK(runtime->BuildProgram(program.get(), build_option)); } TEST(cl_test, context_test) { @@ -51,9 +52,11 @@ TEST(cl_test, context_test) { CHECK(runtime->IsInitSuccess()); runtime->set_cl_path(FLAGS_cl_path); CLContext context; - context.AddKernel("pool_max", "image/pool_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); + context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); + context.AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); + context.AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); } TEST(cl_test, kernel_test) { @@ -61,9 +64,11 @@ TEST(cl_test, kernel_test) { CHECK(runtime->IsInitSuccess()); runtime->set_cl_path(FLAGS_cl_path); std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); + context->AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); + context->AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); + context->AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); auto kernel = context->GetKernel(2); std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); @@ -115,203 +120,12 @@ TEST(cl_test, kernel_test) { LOG(INFO) << out_image; } -TEST(cl_test, channel_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 
512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{16}); - std::unique_ptr bias_data(new float[16]); - for (int i = 0; i < 16; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - float b = bias_data[j]; - for (int k = 0; k < 256 * 512; k++) { - int index = (i * 16 + j) * 256 * 512 + k; - out_ref[index] = in_data[index] + b; - } - } - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, elementwise_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - out_ref[i] = in_data[i] + bias_data[i]; - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -void pool_avg(const int padding_height, - const int padding_width, - const int stride_height, - const int stride_width, - const int ksize_height, - const int ksize_width, - const float *input_data, - const DDim &in_dim, - float *output_data, - const DDim &out_dim) { - const int batch_size = in_dim[0]; - const int input_height = in_dim[2]; - const int input_width = in_dim[3]; - const int output_channels = out_dim[1]; - const int output_height = out_dim[2]; - const int output_width = out_dim[3]; - - const size_t input_spatial_size = input_height * input_width; - const size_t output_spatial_size = output_height * output_width; - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < 
output_channels; ++c) { - int channel = i * output_channels + c; - const float *input_ptr = input_data + channel * input_spatial_size; - float *output_ptr = output_data + channel * output_spatial_size; - - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - float val = 0.f; - int count = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - val += input_ptr[h * input_width + w]; - ++count; - } - } - output_ptr[ph * output_width + pw] = - (count > 0) ? val * (1.f / count) : 0.f; - } - } - } - } -} - -TEST(cl_test, pool_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); - std::unique_ptr in_data(new float[4 * 1024 * 7 * 7]); - for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { - in_data[i] = dist(engine); - } - - const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); - std::unique_ptr out(new float[4 * 1024 * 1 * 1]); - std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("pool_avg", "image/pool_kernel.cl"); - pool(context.get(), - "avg", - 0, - 0, - 1, - 1, - 7, - 7, - in_data.get(), - in_dim, - out.get(), - out_dim); - pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim); - - for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - TEST(cl_test, target_wrapper_buffer_test) { bool inited = InitOpenCLRuntime(FLAGS_cl_path); CHECK(inited) << "Fail to initialize OpenCL runtime."; std::unique_ptr context(new CLContext); std::string kernel_name = "elementwise_add"; - std::string build_options = "-DCL_DTYPE=float"; + std::string build_options = "-DCL_DTYPE_float"; context->AddKernel( kernel_name, "buffer/elementwise_add_kernel.cl", build_options); std::vector h_a; @@ -396,10 +210,13 @@ TEST(cl_test, target_wrapper_buffer_test) { TEST(cl_test, target_wrapper_image_test) { const size_t cl_image2d_width = 28; const size_t cl_image2d_height = 32; + const size_t cl_image2d_elem_size = + cl_image2d_width * cl_image2d_height * 4; // 4 for RGBA channels const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_slice_pitch{0}; auto *d_image = static_cast( TargetWrapperCL::MallocImage(cl_image2d_width, cl_image2d_height)); + // Map/Unmap test auto *h_image = static_cast(TargetWrapperCL::MapImage(d_image, @@ -407,15 +224,11 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_height, cl_image2d_row_pitch, cl_image2d_slice_pitch)); - CHECK_EQ( - cl_image2d_row_pitch, - cl_image2d_width * 4 * - 4); // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes) - CHECK_EQ(cl_image2d_slice_pitch, 0); // slice_pitch = 0 + CHECK_EQ(cl_image2d_slice_pitch, 0); LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch << ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch; - for (int i = 0; i < 10; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { h_image[i] = 3.14f * i; } TargetWrapperCL::Unmap(d_image, h_image); @@ -426,15 +239,14 @@ TEST(cl_test, 
target_wrapper_image_test) { cl_image2d_height, cl_image2d_row_pitch, cl_image2d_slice_pitch)); - for (int i = 0; i < 10; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6); } TargetWrapperCL::Unmap(d_image, h_ptr); // Imagecpy test - std::vector h_image_cpy(cl_image2d_width * 4 * - cl_image2d_height); // 4 for RGBA channels - for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) { + std::vector h_image_cpy(cl_image2d_elem_size); + for (int i = 0; i < cl_image2d_elem_size; i++) { h_image_cpy[i] = 3.14f; } TargetWrapperCL::ImgcpySync(d_image, @@ -446,6 +258,8 @@ TEST(cl_test, target_wrapper_image_test) { IoDirection::HtoD); auto *d_image_cpy = static_cast( TargetWrapperCL::MallocImage(cl_image2d_width, cl_image2d_height)); + + // device to device TargetWrapperCL::ImgcpySync(d_image_cpy, d_image, cl_image2d_width, @@ -454,6 +268,8 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_slice_pitch, IoDirection::DtoD); std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0); + + // host to device TargetWrapperCL::ImgcpySync(h_image_cpy.data(), d_image_cpy, cl_image2d_width, @@ -461,7 +277,7 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_row_pitch, cl_image2d_slice_pitch, IoDirection::DtoH); - for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6); } diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index 6faa8045576f06d8c636372de644e6b5c164a5f4..962eb8d3ef35bdb603aa4a56181b1124885d5506 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -103,6 +103,7 @@ class CLImageConverterNormal : public CLImageConverterBase { }; class CLImageConverterNWBlock : public CLImageConverterBase { + public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, float *image, @@ -113,6 +114,7 @@ class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &tensor_dim) override; }; class CLImageConverterDWBlock : public CLImageConverterBase { + public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, float *image, diff --git a/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1574cb4a69cd0388698707d8d91c1d9c18b625a2 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <cl_common.h>
+
+__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
+                      int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
+  const int index = get_global_id(0);
+  if (index < size) {
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = index * post_size + i * total;
+      int offset_in = index * post_size + i * total0;
+      // memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data0 + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  } else if (index < axis_size) {
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = index * post_size + i * total;
+      int offset_in = index * post_size + i * total1;
+      // memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data1 + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  }
+}
+
+__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
+                         int axis_size, int pre_size, int post_size, int start, int total, int total0) {
+  const int index = get_global_id(0);
+  if (index < axis_size) {
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = (start + index) * post_size + i * total;
+      int offset_in = index * post_size + i * total0;
+      // memcpy(out_data + offset_out, x_data + offset_in, post_size);
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  }
+}
diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
index c9c16581d67db0c9143e91e13249edfd5901ddb8..532f947dd342b1ee4db69a084111a97ec014237f 100644
--- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
@@ -61,6 +61,57 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
   write_imagef(output_image, output_pos, output);
 }
 
+// buffer -> image2d_nw
+__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
+                                   __write_only image2d_t output_image,
+                                   __private const int out_H,
+                                   __private const int out_W,
+                                   __private const int out_N,
+                                   __private const int Stride0,
+                                   __private const int Stride1,
+                                   __private const int Stride2) {
+  const int out_n = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_ch = get_global_id(2);
+
+  const int out_c = out_ch / out_H;
+  const int out_h = out_ch % out_H;
+
+  const int in_c = out_c;  // index of c in h direction
+
+  const int in_n0 = out_n * 4 + 0;
+  const int in_n1 = out_n * 4 + 1;
+  const int in_n2 = out_n * 4 + 2;
+  const int in_n3 = out_n * 4 + 3;
+
+  const int in_h = out_h;
+  const int in_w = out_w;
+
+  int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w;
+  int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w;
+  int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w;
+  int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w;
+
+  int2 output_pos;
+  output_pos.x = out_n * out_W + out_w;
+  output_pos.y = out_ch;
+
+  CL_DTYPE4 output = (CL_DTYPE4)0.0f;
+  output.x = convert_float(in[input_pos0]);
+  if (out_N - 4 * out_n >= 2) {
+    output.y = convert_float(in[input_pos1]);
+  }
+  if (out_N - 4 * out_n >= 3) {
+    output.z = convert_float(in[input_pos2]);
+  }
+  if (out_N - 4 * out_n >= 4) {
+    output.w = convert_float(in[input_pos3]);
+  }
+  write_imagef(output_image, output_pos, output);
+}
+
+
 // image2d -> buffer
 __kernel void image2d_to_buffer(__read_only image2d_t input,
                                 __private const int in_width,
diff --git a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..615bf892b321ba67043d41f6032caa758d78c16f
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
+  const int index = get_global_id(0);
+  if (index < count) {
+    out_data[index] = 1 / (1 + exp(-x_data[index]));
+  }
+}
diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h
index 7f901fc994ffd82ccfe99f59614a3422260d0dc5..c127c6cec79cb2eb8d82ce6aa6190b23d373ff64 100644
--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ b/lite/backends/opencl/cl_kernel/cl_common.h
@@ -14,8 +14,17 @@ limitations under the License. */
 
 #pragma once
 
+/////////////////////////////////
+// fp16 enabled, MAX_VALUE, MIN_VALUE
+/////////////////////////////////
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define MAX_VALUE FLT_MAX
+#define MIN_VALUE -FLT_MAX
+
+/////////////////////////////////
+// CL_DTYPE_float / CL_DTYPE_half
+/////////////////////////////////
 // Data type: pass one of macros on host: [CL_DTYPE_float, CL_DTYPE_half]
 #ifdef CL_DTYPE_float
 #define CL_DTYPE float
@@ -27,31 +36,43 @@ limitations under the License.
*/ #define CL_DTYPE_CHAR h #endif +///////////////////////////////// +// GET_VEC_TYPE +///////////////////////////////// // Note: macro name replacement need twice parser #define GET_VEC_TYPE(type__, size__) type__##size__ #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) +///////////////////////////////// +// CONVERT_TYPE_TO +///////////////////////////////// #define _CONVERT_TYPE_TO(value, type) convert_##type(value) #define CONVERT_TYPE_TO(value, type) _CONVERT_TYPE_TO(value, type) +///////////////////////////////// +// WRITE_IMG_TYPE / READ_IMG_TYPE +///////////////////////////////// #define _WRITE_IMG_TYPE(type_char, img, pos, value) \ write_image##type_char(img, pos, value) #define WRITE_IMG_TYPE(type_char, img, pos, value) \ _WRITE_IMG_TYPE(type_char, img, pos, value) -#define _READ_IMG_TYPE(type_char, img, pos, sampler) \ +#define _READ_IMG_TYPE(type_char, img, sampler, pos) \ read_image##type_char(img, sampler, pos) -#define READ_IMG_TYPE(type_char, img, pos, sampler) \ - _READ_IMG_TYPE(type_char, img, pos, sampler) +#define READ_IMG_TYPE(type_char, img, sampler, pos) \ + _READ_IMG_TYPE(type_char, img, sampler, pos) +///////////////////////////////// +// activation / activation_type4 +///////////////////////////////// inline CL_DTYPE activation(CL_DTYPE in #ifdef PRELU , CL_DTYPE prelu_alpha #endif ) { - CL_DTYPE output; + CL_DTYPE output = in; #ifdef PRELU output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0); #endif @@ -59,5 +80,30 @@ inline CL_DTYPE activation(CL_DTYPE in #ifdef RELU output = fmax(in, (CL_DTYPE)0); #endif + +#ifdef RELU6 + output = clamp(in, (CL_DTYPE)0, (CL_DTYPE)6); +#endif + return output; +} + +inline CL_DTYPE4 activation_type4(CL_DTYPE4 in +#ifdef PRELU + , + CL_DTYPE4 prelu_alpha +#endif + ) { + CL_DTYPE4 output = in; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (CL_DTYPE4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (CL_DTYPE4)0); +#endif + +#ifdef RELU6 + output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); +#endif return output; } diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..f0335116f87aac34740dd22ac68f2b6265e62445 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <cl_common.h>
+
+__kernel void concat2(__read_only image2d_t input0,
+                      __read_only image2d_t input1,
+                      __write_only image2d_t output,
+                      int axis_size, int flag, int width) {
+  const int x = get_global_id(0);  // image_width cxw/4
+  const int y = get_global_id(1);  // image_height nxh
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  int xx = x / width;
+  if (flag == 0) {
+    xx = y / width;
+  }
+  if (xx < axis_size) {
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  } else {
+    int new_val = xx - axis_size;
+    new_val *= width;
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, (int2)(new_val, y));
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  }
+  // WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+}
+
+__kernel void concat_mul(__read_only image2d_t input0,
+                         __write_only image2d_t output,
+                         int axis_size, int flag, int width, int start) {
+  const int x = get_global_id(0);  // image_width cxw/4
+  const int y = get_global_id(1);  // image_height nxh
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  int xx = x / width;
+  if (flag == 0) {
+    xx = y / width;
+  }
+
+  if (xx < axis_size && xx >= start) {
+    xx -= start;
+    xx *= width;
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  }
+}
diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..37e03e802c56d3de9ba08e97c9dfb62f8cd76e9a
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -0,0 +1,385 @@
+#include <cl_common.h>
+
+__kernel void conv2d_1x1(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int input_c_origin,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height,
+                         __private const int old_w) {
+  CL_DTYPE zero = 0.0f;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+  int out_w3 = out_w + global_size_dim1 * 3;
+
+  int outpos_main = mul24(out_c, old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+
+  int2
ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + CL_DTYPE4 output0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; +#elif defined(BIASE_ELE) + CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; + +#else + CL_DTYPE4 output0 = 0.0f; + CL_DTYPE4 output1 = 0.0f; + CL_DTYPE4 output2 = 0.0f; + CL_DTYPE4 output3 = 0.0f; +#endif + + int max_w_bound = input_c * input_width; + int burndary_index = input_c * 4 - input_c_origin; + bool burndary_index_w = + burndary_index == 1 || burndary_index == 2 || burndary_index == 3; + bool burndary_index_z = burndary_index == 2 || burndary_index == 3; + bool burndary_index_y = burndary_index == 3; + + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); + CL_DTYPE4 input0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + CL_DTYPE4 weight0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0)); + CL_DTYPE4 weight1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1)); + CL_DTYPE4 weight2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); + CL_DTYPE4 weight3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); + int bound_gap = max_w_bound - pos_in.x - 1; + + bool outof_bound = bound_gap < input_width && bound_gap >= 0; + input0.w = select(input0.w, zero, outof_bound && burndary_index_w); + input0.z = select(input0.z, zero, outof_bound && burndary_index_z); + input0.y = select(input0.y, zero, outof_bound && burndary_index_y); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); + CL_DTYPE4 input1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input1.w = select(input1.w, zero, outof_bound && burndary_index_w); + input1.z = select(input1.z, zero, outof_bound && burndary_index_z); + input1.y = select(input1.y, zero, outof_bound && burndary_index_y); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); + CL_DTYPE4 input2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input2.w = select(input2.w, zero, outof_bound && burndary_index_w); + input2.z = select(input2.z, zero, 
outof_bound && burndary_index_z); + input2.y = select(input2.y, zero, outof_bound && burndary_index_y); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); + CL_DTYPE4 input3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input3.w = + select(input3.w, + zero, + outof_bound && (burndary_index == 1 || burndary_index == 2 || + burndary_index == 3)); + input3.z = + select(input3.z, + zero, + outof_bound && (burndary_index == 2 || burndary_index == 3)); + input3.y = select(input3.y, zero, outof_bound && burndary_index == 3); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output0 = activation_type4(output0); + output1 = activation_type4(output1); + output2 = activation_type4(output2); + output3 = activation_type4(output3); +#endif + + if (out_w0 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); + } + + if (out_w1 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1); + } + + if (out_w2 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2); + } + + if (out_w3 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3); + } +} + +__kernel void conv2d_1x1_simple(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM +__read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + + int outpos_main = mul24(out_c, old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + int2 
output_pos1 = (int2)(outpos_main + out_w1, out_nh); + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + CL_DTYPE4 output0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; +#elif defined(BIASE_ELE) + CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; + +#else + CL_DTYPE4 output0 = 0.0f; + CL_DTYPE4 output1 = 0.0f; + CL_DTYPE4 output2 = 0.0f; + CL_DTYPE4 output3 = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); + CL_DTYPE4 input0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + CL_DTYPE4 weight0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0)); + CL_DTYPE4 weight1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1)); + CL_DTYPE4 weight2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); + CL_DTYPE4 weight3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); + CL_DTYPE4 input1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); + CL_DTYPE4 input2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); + CL_DTYPE4 input3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * 
READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + + + output0 = activation_type4(output0); + output1 = activation_type4(output1); + output2 = activation_type4(output2); + output3 = activation_type4(output3); + + + if (out_w0 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); + } + + if (out_w1 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1); + } + + if (out_w2 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2); + } + + if (out_w3 < old_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..8d7950d6b897df833ada56e2de5be7c6203de9ea --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -0,0 +1,428 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void conv2d_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height, + __private const int output_c, + __private const int filter_channel, + __private const int filter_width, + __private const int filter_height, + __private const int group) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE_CH + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + CL_DTYPE4 input[9]; // 3x3 region of input + if (group == 1) { + for (int i = 0; i < input_c; ++i) { // each run for 3x3 + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + + input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || 
in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += 
dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + 
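+        // Filter image layout used by this unrolled loop: x indexes the
+        // input-channel block and 3x3 column (i * 3 + j % 3); y indexes the
+        // output-channel block, output lane, and 3x3 row
+        // (out_c * 4 * 3 + lane * 3 + j / 3). Each READ_IMG_TYPE therefore
+        // fetches the four input-channel taps of one filter cell, and
+        // dot(input[j], weight_*) reduces over those four channels at once.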
output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { // group != 1 + for (int i = 0; i < 4; i++) { + int used_input_channel_num = + (out_c * 4 + i) / (output_c / group) * filter_channel; + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + 
(int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + CL_DTYPE tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + + int f_c_offset = f_c % 4; + CL_DTYPE f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; + } + + int input_c_offset = input_c % 4; + CL_DTYPE input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; + } + tmp_out += f_value * input_value; + } + + if (i == 0) { + output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; + } + } + } + } + + output = activation_type4(output); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d856af6a1d4026b1595bc287901e53f64267dc81 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -0,0 +1,169 @@ +#include + +__kernel void conv2d_5x5(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; 
+ ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh_in_one_batch; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE_CH + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + CL_DTYPE4 input; + CL_DTYPE4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y + batch_index * input_height); + for (int j = 0; j < 5; j++) { + for (int k = 0; k < 5; k++) { + input = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + (j - 2) * dilation, + pos_in.y + (k - 2) * dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 2) * dilation < 0 || + in_pos_in_one_block.y + (k - 2) * dilation < 0 || + in_pos_in_one_block.x + (j - 2) * dilation >= input_width || + in_pos_in_one_block.y + (k - 2) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 5 + filter_w; + filter_pos0.y = filter_n0 * 5 + filter_h; + + filter_pos1.x = filter_c * 5 + filter_w; + filter_pos1.y = filter_n1 * 5 + filter_h; + + filter_pos2.x = filter_c * 5 + filter_w; + filter_pos2.y = filter_n2 * 5 + filter_h; + + filter_pos3.x = filter_c * 5 + filter_w; + filter_pos3.y = filter_n3 * 5 + filter_h; + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0); + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1); + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2); + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + // + // if (output_pos.x == 0 && output_pos.y == 5) { + // printf("i,j,k ={ %d, %d , %d }\n", i,j,k); + // printf("in={ %f , %f , %f , %f } \n", + // convert_float(input.x), + // convert_float(input.y), + // convert_float(input.z), + // convert_float(input.w)); + // printf("filter0={ %f , %f , %f , %f } \n", + // convert_float(filter[0].x), + // convert_float(filter[0].y), + // convert_float(filter[0].z), + // convert_float(filter[0].w)); + // printf("filter1={ %f , %f , %f , %f } \n", + // convert_float(filter[1].x), + // convert_float(filter[1].y), + // convert_float(filter[1].z), + // convert_float(filter[1].w)); + // printf("filter2={ %f , %f , %f , %f } \n", + // convert_float(filter[2].x), + // convert_float(filter[2].y), + // convert_float(filter[2].z), + // convert_float(filter[2].w)); + // printf("filter3={ %f , %f , %f , %f } \n", + // convert_float(filter[3].x), + // convert_float(filter[3].y), + // convert_float(filter[3].z), + // convert_float(filter[3].w)); + // printf("output={ %f , %f , %f , %f } \n", + // convert_float(output.x), + // convert_float(output.y), + // convert_float(output.z), + // convert_float(output.w)); + // } + } + } + } + +#ifdef BATCH_NORM + output = + output * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, 
(int2)(out_c, 0)) +
+           READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+  output = activation_type4(output);
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1f99322812c13287af92b52aee6c346309ee006c
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
@@ -0,0 +1,134 @@
+#include <cl_common.h>
+
+__kernel void conv2d_7x7(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+
+  const int batch_index = out_nh / output_height;
+  const int out_nh_in_one_batch = out_nh % output_height;
+
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
+
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
+
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh_in_one_batch;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+
+  CL_DTYPE4 input;
+  CL_DTYPE4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                         in_pos_in_one_block.y + batch_index * input_height);
+    for (int j = 0; j < 7; j++) {
+      for (int k = 0; k < 7; k++) {
+        input = select(
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
+                          (int2)(pos_in.x + (j - 3) * dilation,
+                                 pos_in.y + (k - 3) * dilation)),
+            (CL_DTYPE4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 7 + filter_w;
+        filter_pos0.y = filter_n0 * 7 + filter_h;
+
+        filter_pos1.x = filter_c * 7 + filter_w;
+        filter_pos1.y =
filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0); + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1); + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2); + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } + } + } + +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + + output = activation_type4(output); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl new file mode 100755 index 0000000000000000000000000000000000000000..27313aea23ed16ecc7a6763dfbbbe63bca18941a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void depth_conv2d(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int filter_width, + __private const int filter_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE_CH + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x; + int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y; + int2 align = {filter_width / 2, filter_height / 2}; + for (int fy = 0; fy < filter_height; ++fy) { + for (int fx = 0; fx < filter_width; ++fx) { + int x_off = fx - align.x; + int y_off = fy - align.y; + CL_DTYPE4 in = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(input_x_base + x_off, input_y_base + y_off)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + x_off < 0 || + in_pos_in_one_block.y + y_off < 0 || + in_pos_in_one_block.x + x_off >= input_width || + in_pos_in_one_block.y + y_off >= input_height) + << 15)); + CL_DTYPE4 f = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + fx, filter_y + fy)); + output += in * f; + } + } +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + + output = activation_type4(output); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl new file mode 100755 index 0000000000000000000000000000000000000000..14086dcd16bd1a8770f444bdcd0b6bea78e23b7e --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -0,0 +1,322 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#include + +__kernel void depth_conv2d_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x ; + int filter_y = pos_in_filter_block.y ; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width 
|| in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + + inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); + filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + for(int i = 0 ;i < 9 ; i++){ + output += inputs[i] * filters[i]; + } +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + 
+ CL_DTYPE4 filters[9]; + filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); + filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation_type4(output); +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + + +__kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#ifdef BIASE_CH + CL_DTYPE4 output[2]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; +#elif defined(BIASE_ELE) + CL_DTYPE4 output[2]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } +#else + CL_DTYPE4 output[2] = {0.0f}; +#endif + + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >=
in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + + filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + + filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); +#ifdef BATCH_NORM + CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0)); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0)); + output[0] = mad(scale, output[0], biase); + if (ou_col_id + 1 < ou_w) { + output[1] = mad(scale, output[1], biase); + } +#endif + +#ifdef RELU + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } + +} + diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl index ecf719ae9316ed14743e872a1c2cde4b254b35ff..0d8867e6a79b57927c0d23ff549d3b845556dfd8 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl @@ -12,15 +12,74 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) { +#include <cl_common.h> + +__kernel void elementwise_add(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; coords.x = x; coords.y = y; - float4 in = read_imagef(input, sampler, coords); - float4 biase = read_imagef(bias, sampler, coords); - float4 output = in + biase; - write_imagef(outputImage,coords,output); + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = activation_type4(in + biase); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } + +__kernel void channel_add(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + // one bias texel per channel block, hence x / w (not x % w) + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in + biase; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void width_add(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output; + + output.x = in.x + biase.x; + output.y = in.y + biase.x; + output.z = in.z + biase.x; + output.w = in.w + biase.x; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +}
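The three kernels above differ only in how the bias texel is addressed within the CHW4 image layout (image x = channel_block * width + column): elementwise_add reads the bias at the input's own coordinates, channel_add reads one CL_DTYPE4 per channel block, and width_add broadcasts a single scalar per column. A sketch of the two broadcast mappings, assuming that layout; helper names are illustrative:

// One bias texel per channel block: every column of a block shares it.
int2 bias_coords_per_channel(int x, int w) { return (int2)(x / w, 0); }
// One bias scalar per column: every channel block of a column shares it.
int2 bias_coords_per_width(int x, int w) { return (int2)(x % w, 0); }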
diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..17b6e8c72a82718a541841ff3c69c175649d7056 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cl_common.h> + +__kernel void elementwise_mul(__read_only image2d_t input, __read_only image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias, + __write_only image2d_t outputImage, int w, int h) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = y % h; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +}
diff --git a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b74449d9c8a02551cd74d366849768b4a91a4dce --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output, + __private const float scale_h, __private const float scale_w, + __private const int in_dims_h, __private const int out_dims_h, + __private const int in_dims_w, __private const int out_dims_w) { + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + int2 output_pos; + output_pos.x = c * out_dims_w + w; + output_pos.y = nh; + int out_n = nh / out_dims_h; + int out_h = nh % out_dims_h; + int2 input_pos; + input_pos.x = c * in_dims_w + w / scale_w; + input_pos.y = out_n * in_dims_h + out_h / scale_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); + write_imageh(output, (int2)(output_pos.x, output_pos.y), input_data); +} diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 0ca3b9141daf671737af8d24cd03e59587e33350..775166261d01dc639cd5af8cee49f7e7fb30cb19 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -12,15 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { +#include <cl_common.h> + +__kernel void pool_max(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -40,25 +45,30 @@ __kernel void pool_max( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - float4 max_value = (float4)(MIN_VALUE); + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), max_value); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), max_value); } -__kernel void pool_avg( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int
ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { +__kernel void pool_avg(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -76,15 +86,14 @@ __kernel void pool_avg( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - float4 sum = (float4)(0.0f); - int num = 0; + CL_DTYPE4 sum = (CL_DTYPE4)(0.0f); + for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - num++; + sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); } } - float4 avg = sum / num; + CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), avg); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } diff --git a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..7750bd98a29151ba2428bdafd462420393fe7433 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <cl_common.h> + +__kernel void relu6(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold) { + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} diff --git a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu_kernel.cl index a99ac79d32bcedb48354d2e179ef6c8c1ff7f997..43a27067c2f2c418d314f9bce95bccbbb51a9be0 100644 --- a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/relu_kernel.cl @@ -24,7 +24,7 @@ __kernel void relu(__read_only image2d_t input, CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(x, y)); + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); in = max((CL_DTYPE4)(0.0f), in); - write_imagef(output, (int2)(x, y), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); }
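The relu6 body above is the usual two-sided clamp spelled as max-then-min; OpenCL's built-in clamp expresses the same computation in one call. An equivalent sketch under the same macros (function name illustrative):

CL_DTYPE4 relu6_ref(CL_DTYPE4 v, float threshold) {
  // clamp(v, lo, hi) == min(max(v, lo), hi), applied per component.
  return clamp(v, (CL_DTYPE4)(0.0f), (CL_DTYPE4)(threshold));
}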
diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..314be875d29d2125f9573d33010ee9d33317ea71 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl @@ -0,0 +1,162 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cl_common.h> + +__kernel void reshape(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H, + __private const int in_Stride0, + __private const int in_Stride1, + __private const int in_Stride2, + __private const int out_Stride0, + __private const int out_Stride1, + __private const int out_Stride2) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4 + 2; + const int out_c3 = out_c * 4 + 3; + + int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w; + int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w; + int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w; + int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w; + + int in_n0 = count0 / in_Stride2; + int in_n1 = count1 / in_Stride2; + int in_n2 = count2 / in_Stride2; + int in_n3 = count3 / in_Stride2; + + count0 = count0 % in_Stride2; + count1 = count1 % in_Stride2; + count2 = count2 % in_Stride2; + count3 = count3 % in_Stride2; + + int in_c0 = count0 / in_Stride1; + int in_c1 = count1 / in_Stride1; + int in_c2 = count2 / in_Stride1; + int in_c3 = count3 / in_Stride1; + + int in_h0 = (count0 % in_Stride1) / in_Stride0; + int in_h1 = (count1 % in_Stride1) / in_Stride0; + int in_h2 = (count2 % in_Stride1) / in_Stride0; + int in_h3 = (count3 % in_Stride1) / in_Stride0; + + int in_w0 = (count0 % in_Stride1) % in_Stride0; + int in_w1 = (count1 % in_Stride1) % in_Stride0; + int in_w2 = (count2 % in_Stride1) % in_Stride0; + int in_w3 = (count3 % in_Stride1) % in_Stride0; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = (in_c0 / 4) * in_W + in_w0; + input_pos0.y = in_n0 * in_H + in_h0; + + input_pos1.x = (in_c1 / 4) * in_W + in_w1; + input_pos1.y = in_n1 * in_H + in_h1; + + input_pos2.x = (in_c2 / 4) * in_W + in_w2; + input_pos2.y = in_n2 * in_H + in_h2; + + input_pos3.x = (in_c3 / 4) * in_W + in_w3; + input_pos3.y = in_n3 * in_H + in_h3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0); + if (in_c0 % 4 == 0) { + output.x = input0.x; + } else if (in_c0 % 4 == 1) { + output.x = input0.y; + } else if (in_c0 % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1); + if (in_c1 % 4 == 0) { + output.y = input1.x; + } else if (in_c1 % 4 == 1) { + output.y = input1.y; + } else if (in_c1 % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2); + if (in_c2 % 4 == 0) { + output.z = input2.x; + } else if (in_c2 % 4 == 1) { + output.z = input2.y; + } else if (in_c2 % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3); + if (in_c3 % 4 == 0) { + output.w = input3.x; + } else if (in_c3 % 4 == 1) { + output.w = input3.y; + } else if (in_c3 % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +}
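The reshape kernel above is plain NCHW linearization and delinearization done once per channel lane: each output element's flat offset n*Stride2 + c*Stride1 + h*Stride0 + w is decomposed again with the input strides to locate the source texel. A scalar sketch of that round trip, assuming the host passes Stride0 = W, Stride1 = H*W and Stride2 = C*H*W (names illustrative):

int flat_offset(int n, int c, int h, int w, int s0, int s1, int s2) {
  return n * s2 + c * s1 + h * s0 + w;
}

void unflatten(int count, int s0, int s1, int s2,
               int *n, int *c, int *h, int *w) {
  *n = count / s2;
  count = count % s2;
  *c = count / s1;
  *h = (count % s1) / s0;
  *w = (count % s1) % s0;
}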
diff --git a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..739ff1338582b65d87dbd9c92f1ea86e0c49f0ff --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cl_common.h> + +__kernel void scale(__read_only image2d_t input, + __write_only image2d_t output, + __private float scale, + __private float bias) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = convert_float(scale) * in + convert_float(bias); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +}
diff --git a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d2cb8fa36e21167979172fba634a7862c932b74c --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cl_common.h> + +__kernel void sigmoid(__read_only image2d_t input, + __write_only image2d_t output) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = 1 / (1 + exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index c2504ab611e93399c70169f3f123d4a0514c07ad..0c7b2f8575a88082f6d79a5392c4468715a701b9 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -103,6 +103,7 @@ std::unique_ptr CLRuntime::CreateEvent( bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { std::string build_option = options + " -cl-fast-relaxed-math -I " + CLRuntime::Global()->cl_path() + "/cl_kernel"; + VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 575f87d0f8d0192345c6ab111db46715a809a976..310567baa539697f6a67b59f6c0e5f29ce46a80e 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -24,6 +24,8 @@ static cl_channel_type GetCLChannelType(const PrecisionType type) { switch (type) { case PRECISION(kFloat): return CL_FLOAT; + case PRECISION(kFP16): + return CL_HALF_FLOAT; case PRECISION(kInt32): return CL_SIGNED_INT32; case PRECISION(kInt8): @@ -58,17 +60,18 @@ void TargetWrapperCL::Free(void *ptr) { template <> void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; @@ -78,19 +81,20 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, return cl_image; } -template <> -void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { - cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8))); +template <> // use int16_t to represent a half float +void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, + const size_t cl_image2d_height, + void *host_ptr) { + cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); + cl_int status; + cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ?
CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; @@ -102,17 +106,18 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, template <> void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; diff --git a/lite/backends/opencl/target_wrapper.h b/lite/backends/opencl/target_wrapper.h index 7753448052e17ac739f730c9fabcaf9533e0045e..c5ff9e900a70fd96ccb461c74fb61e33815a5e81 100644 --- a/lite/backends/opencl/target_wrapper.h +++ b/lite/backends/opencl/target_wrapper.h @@ -48,7 +48,8 @@ class TargetWrapper { template static void* MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height); + const size_t cl_image2d_height, + void* host_ptr = nullptr); static void FreeImage(void* image); static void* Map(void* buffer, size_t offset, size_t size); diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc index c2759d6191aaa7ba277ff2a935ea6fdda8383e1e..aa097f947a0289b4a44417160fbe5d6e6db48020 100644 --- a/lite/backends/x86/cpu_info.cc +++ b/lite/backends/x86/cpu_info.cc @@ -32,26 +32,37 @@ #include #include -DEFINE_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, - 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +#include "lite/utils/env.h" + +// DEFINE_double(fraction_of_cpu_memory_to_use, +// 1, +// "Default use 100% of CPU memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cpu_memory_to_use = + paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1); + +// DEFINE_uint64(initial_cpu_memory_in_mb, +// 500ul, +// "Initial CPU memory for PaddlePaddle, in MD unit."); +uint64_t initial_cpu_memory_in_mb = + paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul); + +// DEFINE_double( +// fraction_of_cuda_pinned_memory_to_use, +// 0.5, +// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," +// "reserve the rest for page tables, etc"); +double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv( + "fraction_of_cuda_pinned_memory_to_use", 0.5); // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. 
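The pattern in this hunk (and in the dynamic_loader.cc one below) swaps compiled-in gflags definitions for environment-variable lookups, so the settings remain tunable without linking gflags into the lite build. A minimal plain-C sketch of what such a boolean lookup amounts to; the exact parsing rules of paddle::lite::GetBoolFromEnv may differ, and the helper name is illustrative:

#include <stdlib.h>
#include <string.h>

static int get_bool_from_env(const char *name, int default_value) {
  const char *v = getenv(name); /* unset or empty -> keep the default */
  if (v == NULL || *v == '\0') return default_value;
  return strcmp(v, "0") != 0 && strcmp(v, "false") != 0;
}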
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +bool use_pinned_memory = + paddle::lite::GetBoolFromEnv("use_pinned_memory", true); namespace paddle { namespace lite { @@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } size_t CpuMinChunkSize() { @@ -92,15 +103,14 @@ size_t CpuMinChunkSize() { size_t CpuMaxChunkSize() { // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, // or the initial_cpu_memory_in_mb. - return std::min( - static_cast(CpuMaxAllocSize() / 32), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); + return std::min(static_cast(CpuMaxAllocSize() / 32), + static_cast(initial_cpu_memory_in_mb * 1 << 20)); } size_t CUDAPinnedMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); + return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); } size_t CUDAPinnedMinChunkSize() { diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 75bb528f38664fc1061653e1036b73eed74daae9..a05a57e93b23008e49683764b5ed669d5c425e5b 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -22,36 +22,46 @@ limitations under the License. */ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); +// DEFINE_string(cudnn_dir, +// "", +// "Specify path for loading libcudnn.so. For instance, " +// "/usr/local/cudnn/lib. If empty [default], dlopen " +// "will search cudnn from LD_LIBRARY_PATH"); +std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir"); // NOLINT -DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(cuda_dir, +// "", +// "Specify path for loading cuda library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir"); // NOLINT -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +std::string f_warpctc_dir = // NOLINT + paddle::lite::GetStringFromEnv("warpctc_dir"); // NOLINT -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); +// DEFINE_string(nccl_dir, +// "", +// "Specify path for loading nccl library, such as libcublas, " +// "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " +// "dlopen will search cuda from LD_LIBRARY_PATH"); +std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir"); // NOLINT -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); +std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir"); // NOLINT -DEFINE_string( - tensorrt_dir, - "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); +// DEFINE_string( +// tensorrt_dir, +// "", +// "Specify path for loading tensorrt library, such as libnvinfer.so."); +std::string tensorrt_dir = // NOLINT + paddle::lite::GetStringFromEnv("tensorrt_dir"); // NOLINT -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir"); // NOLINT namespace paddle { namespace lite { @@ -180,28 +190,28 @@ auto error_msg = void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so"); #endif } void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); + return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); + return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false); #endif } void* GetCUPTIDsoHandle() { std::string cupti_path = cupti_lib_path; - if (!FLAGS_cupti_dir.empty()) { - cupti_path = FLAGS_cupti_dir; + if (!cupti_dir.empty()) { + cupti_path = cupti_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false); @@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); + return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib); #else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); + return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so"); #endif } void* GetWarpCTCDsoHandle() { std::string warpctc_dir = warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; + if (!f_warpctc_dir.empty()) { + warpctc_dir = f_warpctc_dir; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); @@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() { void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); + return 
GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib"); #else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so"); + return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so"); #endif } void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib"); #else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); + return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so"); #endif } void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); #elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); + return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so"); #endif } diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md index cd2aa5c242dba1a9be669a536cd9b614bf890e48..dc9eb4cf239155ba15a855c98e5515adb717d2d5 100644 --- a/lite/backends/x86/jit/README.en.md +++ b/lite/backends/x86/jit/README.en.md @@ -89,7 +89,7 @@ All kernels are included in `lite/backends/x86/jit/kernels.h`, which is automatically generated. 3. Add reference function of `your_key`. Note: - this should be run on CPU and must not depend on any third-party library. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. + - Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. 4. Add unit test in `test.cc`, and verify at least `float` and `double`. Test more data types for some special functions if necessary, for example `int8`. 5. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `GetDefaultBestFunc` always gets the best one. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md index 6998c5d867b079dfef69a71ca56e6f3fc30363d4..bc0e27234d05c82c9b0dcc431343d7db1a0f4067 100644 --- a/lite/backends/x86/jit/README.md +++ b/lite/backends/x86/jit/README.md @@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/ # 如何添加新的算子 1. 在`KernelType` 中添加 `your_key` 。 -2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。 +2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。 3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。 4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。 5.
添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。 diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt index 99244ea9bd919a018732b75d1ab811e8bf338516..62500775282d1c3d960f0fa9b00d3d4a2aef9390 100644 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ b/lite/backends/x86/jit/gen/CMakeLists.txt @@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +function(USE_JITKERNEL_GEN_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n") endfunction() # use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) +USE_JITKERNEL_GEN_LITE(kMatMul) +USE_JITKERNEL_GEN_LITE(kVMul) +USE_JITKERNEL_GEN_LITE(kVAdd) +USE_JITKERNEL_GEN_LITE(kVSub) +USE_JITKERNEL_GEN_LITE(kVAddRelu) +USE_JITKERNEL_GEN_LITE(kVScal) +USE_JITKERNEL_GEN_LITE(kVAddBias) +USE_JITKERNEL_GEN_LITE(kVRelu) +USE_JITKERNEL_GEN_LITE(kVSquare) +USE_JITKERNEL_GEN_LITE(kVIdentity) +USE_JITKERNEL_GEN_LITE(kVExp) +USE_JITKERNEL_GEN_LITE(kVSigmoid) +USE_JITKERNEL_GEN_LITE(kVTanh) +USE_JITKERNEL_GEN_LITE(kLSTMCtHt) +USE_JITKERNEL_GEN_LITE(kLSTMC1H1) +USE_JITKERNEL_GEN_LITE(kGRUH1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart2) +USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC) +USE_JITKERNEL_GEN_LITE(kSeqPool) +USE_JITKERNEL_GEN_LITE(kHMax) +USE_JITKERNEL_GEN_LITE(kHSum) +USE_JITKERNEL_GEN_LITE(kEmbSeqPool) +USE_JITKERNEL_GEN_LITE(kSgd) +USE_JITKERNEL_GEN_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc index f1f261c199d8d25997b1ce235aa99356834e43a8..45f4f7ddcce8e8864821712698c4496cf40b618c 100644 --- a/lite/backends/x86/jit/gen/act.cc +++ b/lite/backends/x86/jit/gen/act.cc @@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); +REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator); +REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator); diff --git 
a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc index 0bddea6ace7fd338d14da918516223bb17bafdbd..37183e66404dfae139a2bcd25c2855df119f939d 100644 --- a/lite/backends/x86/jit/gen/blas.cc +++ b/lite/backends/x86/jit/gen/blas.cc @@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); -REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); +REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 2ff6894383f95699e4209215b0df3a84507a06b4..7e697014ed241a75693b783127633b255964f80b 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc index c5737faf134287697ef49b88f10c2590da4cc07d..4c2c57413e30589de96385c34e09733458f66b7b 100644 --- a/lite/backends/x86/jit/gen/gru.cc +++ b/lite/backends/x86/jit/gen/gru.cc @@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/hopv.cc b/lite/backends/x86/jit/gen/hopv.cc index 4304dc48c5a084a747227bd4d4aedb1cec1775cd..0fdd63a7405647860416d43a86a7a7abe9fad760 100644 --- a/lite/backends/x86/jit/gen/hopv.cc +++ b/lite/backends/x86/jit/gen/hopv.cc @@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); +REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc index 44e58d0b75612238115d5771082d28c30cad55a2..e4417355202c6370563eadd80e5cb3da6af8cdc6 100644 --- a/lite/backends/x86/jit/gen/lstm.cc +++ b/lite/backends/x86/jit/gen/lstm.cc @@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git 
a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 2c75f6dd5dc4bbf12513d10ef0a4e02e709135fd..010c80fac4842e74c9b8272db472ddf6cf954771 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator); diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index e0cf5e5a5a7646f09666f6ccb35b18610c845317..4c80737aac4bc9cd09f4ff222c8fad8c441887ec 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 10659f50844d73c14403f9e7a35d800364be1e7b..44e083366132c675b339b2da4bbb3b7c1c6b7569 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator); +REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index 9e02dca8c40975fb45feed1d818bbe6d3e65db19..fb1e71f7b0b1e6f68a331d264682e80fbab7c219 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); +REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 38250d533dd8c94afc87b5f9113ea165d6b7e9ed..7d051aa6f5802844753b71fd43400e20b7f5965b 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,13 +21,15 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/env.h" #include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free #endif -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); +bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen_base.h b/lite/backends/x86/jit/gen_base.h index b5f942615aa001a119273b52c70116ae66e66126..4af93c2447d64e52676a60e33c01c63ba7221910 100644 --- a/lite/backends/x86/jit/gen_base.h +++ b/lite/backends/x86/jit/gen_base.h @@ -20,7 +20,8 @@ #include #include "lite/backends/x86/jit/kernel_base.h" -DECLARE_bool(dump_jitcode); +// DECLARE_bool(dump_jitcode); +extern bool dump_jitcode; namespace paddle { namespace lite { @@ -36,7 +37,7 @@ class GenBase : public Kernel { template Func getCode() const { const unsigned char* code = this->getCodeInternal(); - if (FLAGS_dump_jitcode) { + if (dump_jitcode) { this->dumpCode(code); } // Note: failed to cast with reinterpret_cast on Mac clang, diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt index 
2ddbbcd16a3ffef560581592e3a009c61844d4d5..5641466d8a86e4be7b88d7eaf977e5a58d18f085 100644 --- a/lite/backends/x86/jit/more/CMakeLists.txt +++ b/lite/backends/x86/jit/more/CMakeLists.txt @@ -1,6 +1,6 @@ -function(USE_JITKERNEL_MORE TARGET TYPE) - file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +function(USE_JITKERNEL_MORE_LITE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n") endfunction() # enable it latter diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt index 468937a4f6b27ae525bfd0d8e99cc891eedbc353..80dabc72fbe2db46359cd69760eb5a02cea615af 100644 --- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt +++ b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt @@ -5,5 +5,5 @@ cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic) -USE_JITKERNEL_MORE(kLayerNorm, intrinsic) +USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic) +USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic) diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt index dd039d29152961210958470a48f086a133ab640c..5e0238f26f1ebbd298dba0957bdc93e16671505f 100644 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mix/CMakeLists.txt @@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mix) +USE_JITKERNEL_MORE_LITE(kVTanh, mix) +USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix) +USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix) +USE_JITKERNEL_MORE_LITE(kGRUH1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE_LITE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt b/lite/backends/x86/jit/more/mkl/CMakeLists.txt index 56f1a62ad4e06807dace2a81156d92f6b02a14df..3557f531a561caace51225ad23e2d547ad48d08c 100644 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mkl/CMakeLists.txt @@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) +USE_JITKERNEL_MORE_LITE(kMatMul, mkl) +USE_JITKERNEL_MORE_LITE(kVMul, mkl) +USE_JITKERNEL_MORE_LITE(kVAdd, mkl) +USE_JITKERNEL_MORE_LITE(kVScal, mkl) +USE_JITKERNEL_MORE_LITE(kStrideScal, mkl) +USE_JITKERNEL_MORE_LITE(kVExp, mkl) 
+USE_JITKERNEL_MORE_LITE(kVSquare, mkl) +USE_JITKERNEL_MORE_LITE(kVCopy, mkl) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl) +USE_JITKERNEL_MORE_LITE(kVTanh, mkl) +USE_JITKERNEL_MORE_LITE(kSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSoftmax, mkl) +USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSgd, mkl) +USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 8b713e537e74ca2d2a2e79dad7c325cda9c0e7a4..6bc791e64575b8f481f91ea3c28ea4896fe1860d 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -142,14 +142,13 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride); // remain is the product of dimension shapes after the axis dimension template void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { - std::vector entities(bs); for (int i = 0; i < bs; ++i) { - entities[i] = x[i * n]; + T entity = x[i * n]; for (int c = 1; c < n; ++c) { - entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i]; + entity = x[i * n + c] > entity ? x[i * n + c] : entity; } for (int c = 0; c < n; ++c) { - y[i * n + c] = x[i * n + c] - entities[i]; + y[i * n + c] = x[i * n + c] - entity; } } VExp(y, y, n * bs); diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt index 7133f596620410d37ffe52a2ee92b7a9974bf1cc..c52b21ad7dca102d18aee25aa60079bf03ae82b9 100644 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ b/lite/backends/x86/jit/refer/CMakeLists.txt @@ -2,39 +2,39 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +function(USE_JITKERNEL_REFER_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n") endfunction() # use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) -USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) +USE_JITKERNEL_REFER_LITE(kVMul) +USE_JITKERNEL_REFER_LITE(kVAdd) +USE_JITKERNEL_REFER_LITE(kVAddRelu) +USE_JITKERNEL_REFER_LITE(kVSub) +USE_JITKERNEL_REFER_LITE(kVScal) +USE_JITKERNEL_REFER_LITE(kStrideScal) +USE_JITKERNEL_REFER_LITE(kVAddBias) +USE_JITKERNEL_REFER_LITE(kVCopy) +USE_JITKERNEL_REFER_LITE(kVRelu) +USE_JITKERNEL_REFER_LITE(kVIdentity) +USE_JITKERNEL_REFER_LITE(kVExp) +USE_JITKERNEL_REFER_LITE(kVSigmoid) +USE_JITKERNEL_REFER_LITE(kVTanh) +USE_JITKERNEL_REFER_LITE(kLSTMCtHt) +USE_JITKERNEL_REFER_LITE(kLSTMC1H1) +USE_JITKERNEL_REFER_LITE(kGRUH1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart1) 
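The mkl.h Softmax hunk above trades the heap-allocated `entities` vector for a single scalar running maximum per row; the maximum is subtracted before exponentiation so `exp()` cannot overflow. A stand-alone sketch of the whole row pass follows — note the real function is a template over `T` and defers the exponentiation and normalization to `VExp` and BLAS calls, while this sketch inlines everything with plain `float` loops:

```cpp
// Softmax over bs rows of length n, keeping one scalar maximum per row
// (a register) instead of a heap-allocated vector of per-row maxima.
#include <algorithm>
#include <cmath>

void SoftmaxRows(const float* x, float* y, int n, int bs) {
  for (int i = 0; i < bs; ++i) {
    // Running maximum of row i, the scalar that replaces entities[i].
    float row_max = x[i * n];
    for (int c = 1; c < n; ++c) row_max = std::max(row_max, x[i * n + c]);
    // Subtract the maximum before exp() for numerical stability.
    float sum = 0.f;
    for (int c = 0; c < n; ++c) {
      y[i * n + c] = std::exp(x[i * n + c] - row_max);
      sum += y[i * n + c];
    }
    for (int c = 0; c < n; ++c) y[i * n + c] /= sum;
  }
}
```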
+USE_JITKERNEL_REFER_LITE(kGRUHtPart2) +USE_JITKERNEL_REFER_LITE(kCRFDecoding) +USE_JITKERNEL_REFER_LITE(kLayerNorm) +USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC) +USE_JITKERNEL_REFER_LITE(kSeqPool) +USE_JITKERNEL_REFER_LITE(kMatMul) +USE_JITKERNEL_REFER_LITE(kVSquare) +USE_JITKERNEL_REFER_LITE(kHSum) +USE_JITKERNEL_REFER_LITE(kHMax) +USE_JITKERNEL_REFER_LITE(kStrideASum) +USE_JITKERNEL_REFER_LITE(kSoftmax) +USE_JITKERNEL_REFER_LITE(kEmbSeqPool) +USE_JITKERNEL_REFER_LITE(kSgd) +USE_JITKERNEL_REFER_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc index e1b1240c5d5b0bc382fae8bd1b77f6c412522bdd..c47f8216abd999e66e914b208d96b8f352226f71 100644 --- a/lite/backends/x86/jit/refer/refer.cc +++ b/lite/backends/x86/jit/refer/refer.cc @@ -18,7 +18,7 @@ namespace refer = paddle::lite::jit::refer; #define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ + REGISTER_JITKERNEL_REFER_LITE( \ k##func, refer::func##Kernel, refer::func##Kernel) REGISTER_REFER_KERNEL(VMul); diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h index 7613a8dd4376045beb3636954668130e7220521e..65e3152d70fdd6262583cddced78e43513f0e0a1 100644 --- a/lite/backends/x86/jit/registry.h +++ b/lite/backends/x86/jit/registry.h @@ -77,16 +77,16 @@ class JitKernelRegistrar { void Touch() {} }; -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ +#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace, \ "REGISTER_KERNEL_REFER must be called in global namespace"); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::ReferKernelPool, \ @@ -94,84 +94,84 @@ class JitKernelRegistrar { __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ return 0; \ } // kernel_type: should be in paddle::lite::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ +#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::KernelPool, \ ::paddle::lite::fluid::place_type, \ __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ return 0; \ } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::JitCodeCreatorPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ } -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) +#define USE_JITKERNEL_GEN_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() + +#define USE_JITKERNEL_REFER_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE_LITE must be called in global namespace"); \ + extern int \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define 
USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \ + USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace) } // namespace jit } // namespace lite diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index bbe35b4de5508c70496e5c8566c8d1b982a7155c..8d61fb3bbb97705c697fba934e6cab9424f85bad 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "lite/backends/x86/math/beam_search.h" #include +#include #include #include "lite/fluid/lod.h" diff --git a/lite/backends/x86/math/detail/avx_mathfun.h b/lite/backends/x86/math/detail/avx_mathfun.h index c95c881512900efb4b39df3ba16b8de686caefcb..2ad0866d6346a24690b30d0da317c6d86e9aebba 100644 --- a/lite/backends/x86/math/detail/avx_mathfun.h +++ b/lite/backends/x86/math/detail/avx_mathfun.h @@ -41,9 +41,11 @@ (this is the zlib license) */ - +#pragma once #include "lite/backends/x86/cpu_info.h" +namespace paddle { +namespace lite { /* __m128 is ugly to write */ typedef __m256 v8sf; // vector of 8 float (avx) typedef __m256i v8si; // vector of 8 int (avx) @@ -134,7 +136,7 @@ typedef union imm_xmm_union { return (ret); \ } -//#warning "Using SSE2 to perform AVX2 bitshift ops" +// #warning "Using SSE2 to perform AVX2 bitshift ops" AVX2_BITOP_USING_SSE2(slli_epi32) AVX2_BITOP_USING_SSE2(srli_epi32) @@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32) return (ret); \ } -//#warning "Using SSE2 to perform AVX2 integer ops" +// #warning "Using SSE2 to perform AVX2 integer ops" AVX2_INTOP_USING_SSE2(and_si128) AVX2_INTOP_USING_SSE2(andnot_si128) AVX2_INTOP_USING_SSE2(cmpeq_epi32) @@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32) */ v8sf log256_ps(v8sf x) { v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); // NOLINT + /* cut off denormalized stuff */ // NOLINT // can be done with AVX2 imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); // NOLINT + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // NOLINT // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT v8sf e = _mm256_cvtepi32_ps(imm0); e = _mm256_add_ps(e, one); @@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) { } else { x = x - 1.0; } */ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf mask = + _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); // NOLINT v8sf tmp = _mm256_and_ps(x, mask); x = _mm256_sub_ps(x, one); e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); @@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) { v8sf z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_log_p0; + v8sf y = *(v8sf *)_ps256_cephes_log_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); 
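Before moving on to the avx_mathfun.h lint fixes: the registry.h changes above rename the entire macro family but preserve its structure, which is the classic static-registrar idiom. A self-contained toy version (all names hypothetical) shows why every `REGISTER_*` macro emits a `Touch` function and every `USE_*` macro references it:

```cpp
// Toy model of what the *_LITE macros expand to: a static registrar object
// registers the kernel during static initialization, and a Touch() function
// gives other translation units a symbol to reference so the linker cannot
// dead-strip the object file holding the registrar.
#include <iostream>

struct Registrar {
  explicit Registrar(const char* name) {
    std::cout << "register " << name << "\n";  // Stand-in for pool insertion.
  }
  void Touch() {}  // No-op; referencing it keeps the registrar alive.
};

// What REGISTER_JITKERNEL_*_LITE emits in the kernel's translation unit:
static Registrar g_demo_registrar("kDemo");
int LiteTouchDemoReg() {
  g_demo_registrar.Touch();
  return 0;
}

// What USE_JITKERNEL_*_LITE expands to in a consuming translation unit:
extern int LiteTouchDemoReg();
static int use_demo = LiteTouchDemoReg();

int main() { return use_demo; }
```

Without the `extern`/`static int` pair in the consumer, the linker could drop the registrar's object file and the kernel would silently never register. It is also why `REGISTER_KERNEL_MORE_LITE` and `REGISTER_JITKERNEL_GEN_LITE` touch the refer kernel's registrar first: the `__assert_*_has_refer_` initializer fails to link unless a refer fallback was registered for the same kernel type.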
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); // NOLINT y = _mm256_mul_ps(y, x); y = _mm256_mul_ps(y, z); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); // NOLINT y = _mm256_add_ps(y, tmp); - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); // NOLINT x = _mm256_add_ps(x, y); x = _mm256_add_ps(x, tmp); x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN @@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1); v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; - v8sf one = *(v8sf *)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; // NOLINT - x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); // NOLINT + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); // NOLINT /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); // NOLINT + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); // NOLINT /* how to perform a floorf with SSE: just below */ // imm0 = _mm256_cvttps_epi32(fx); @@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) { mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); // NOLINT + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); // NOLINT x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); // NOLINT y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); // NOLINT y = 
_mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); @@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) { /* build 2^n */ imm0 = _mm256_cvttps_epi32(fx); // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); @@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x sign_bit = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT /* Here we start a series of integer operations, which are in the @@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 @@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x Both branches will be computed. 
*/ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x #endif /* 
take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); - imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); + imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2); // NOLINT /* get the swap sign flag */ - imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); /* get the polynom selection mask */ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = _mm256_cvtepi32_ps(imm2); - imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2); // NOLINT - imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { 
// any x x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT v8sf z = _mm256_mul_ps(x, x); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); // NOLINT /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); + sign_bit_sin = + _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask); // NOLINT /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); // NOLINT #ifdef __AVX2__ /* store the integer part of y in imm2 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); // NOLINT + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); // NOLINT y = _mm256_cvtepi32_ps(imm2); imm4 = imm2; /* get the swap sign flag for the sine */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); // NOLINT imm0 = avx2_mm256_slli_epi32(imm0, 29); // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); /* get the polynom selection mask for the sine*/ - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); - imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2); // NOLINT + imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0); // NOLINT // v8sf poly_mask = _mm256_castsi256_ps(imm2); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2); - imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); + imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1); // NOLINT + imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1); // NOLINT - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1); // NOLINT COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); y = 
_mm256_cvtepi32_ps(imm2); @@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { imm4_1 = imm2_1; imm4_2 = imm2_2; - imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); + imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4); // NOLINT + imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4); // NOLINT imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); + imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2); // NOLINT + imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2); // NOLINT imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); @@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; - xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; - xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; + xmm1 = *(v8sf *)_ps256_minus_cephes_DP1; // NOLINT + xmm2 = *(v8sf *)_ps256_minus_cephes_DP2; // NOLINT + xmm3 = *(v8sf *)_ps256_minus_cephes_DP3; // NOLINT xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { x = _mm256_add_ps(x, xmm3); #ifdef __AVX2__ - imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); - imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); + imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2); // NOLINT + imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4); // NOLINT imm4 = avx2_mm256_slli_epi32(imm4, 29); #else - imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); - imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); + imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2); // NOLINT + imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2); // NOLINT - imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); - imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); + imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4); // NOLINT + imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4); // NOLINT imm4_1 = _mm_slli_epi32(imm4_1, 29); imm4_2 = _mm_slli_epi32(imm4_2, 29); @@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { /* Evaluate the first polynom (0 <= x <= Pi/4) */ v8sf z = _mm256_mul_ps(x, x); - y = *(v8sf *)_ps256_coscof_p0; + y = *(v8sf *)_ps256_coscof_p0; // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1); // NOLINT y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2); // NOLINT y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); // NOLINT y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(v8sf *)_ps256_1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_1); // NOLINT /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - v8sf y2 = *(v8sf *)_ps256_sincof_p0; + v8sf y2 = *(v8sf *)_ps256_sincof_p0; // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); + y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1); // NOLINT y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2); + y2 = _mm256_add_ps(y2, *(v8sf 
*)_ps256_sincof_p2); // NOLINT y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); } + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index 822b7df936d84c21c226a13a48e8c09a2343f86a..a17807e8a997f0ecf908313a4cb205676e4fa4b8 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -110,11 +110,7 @@ void set_constant(const lite::Context& context, lite::Tensor* tensor, float value) { TensorSetConstantWithTarget func(context, tensor, value); - //#ifdef PADDLE_WITH_CUDA - // tensor->target().apply_visitor(func); - //#else func(); - //#endif } template @@ -123,17 +119,19 @@ struct RowwiseAdd { const lite::Tensor& input, const lite::Tensor& vector, lite::Tensor* output) { - auto in_dims = input.dims(); + const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(output->dims(), in_dims); - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(vector); - auto out = lite::fluid::EigenMatrix::From(*output); - + const T* input_data = input.data(); + const T* vector_data = vector.data(); + T* output_data = output->mutable_data(); for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; + for (int64_t j = 0; j < size; ++j) { + output_data[i * in_dims[0] + j] = + input_data[i * in_dims[0] + j] + vector_data[j]; + } } } }; diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index 9da239f9c63371350403cc0bd0eecc94eab87590..ab6c1edb481f914d5751149aca2595fee550ca51 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -49,7 +49,7 @@ class Pool2dFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -130,7 +130,7 @@ class Pool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -213,7 +213,7 @@ class MaxPool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index ff215781f1efeb20a0e126a6e39a8f3508131abd..c12c05414d717dce706590a491ccae2384f3bfe5 100644 --- 
a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,12 +24,12 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - std::vector index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - size_t* index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); + const size_t* index = index_lod.data(); + const auto& src_dims = src.dims(); + const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( src_dims.size(), 2UL, "The src must be matrix with rank 2."); PADDLE_ENFORCE_EQ( diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a97bfaf66607e5ea2efbd6f26f311fb4cd9dab67..a70cc5bf73522f97ab312fc48553b5316dbf8376 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/eigen.h" -// #include "lite/fluid/lod.h" #include "lite/utils/paddle_enforce.h" namespace paddle { @@ -27,11 +26,6 @@ namespace lite { namespace x86 { namespace math { -template -using EigenMatrix = lite::fluid::EigenMatrix; - template class CopyMatrixRowsFunctor { public: @@ -42,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - std::vector index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -56,6 +50,7 @@ class LoDTensor2BatchFunctor { // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} // struct SeqInfo { + SeqInfo() = default; SeqInfo(int start, int length, int seq_idx) : start(start), length(length), seq_idx(seq_idx) {} int start; @@ -89,10 +84,12 @@ class LoDTensor2BatchFunctor { const auto& lod = lods[0]; - std::vector seq_info; + std::vector seq_info(lod.size() - 1); for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { int length = lod[seq_id + 1] - lod[seq_id]; - seq_info.emplace_back(lod[seq_id], length, seq_id); + seq_info[seq_id].start = lod[seq_id]; + seq_info[seq_id].length = length; + seq_info[seq_id].seq_idx = seq_id; } std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { @@ -122,21 +119,19 @@ class LoDTensor2BatchFunctor { // The max_seqlen represents batch size after rearranging the // input LodTensor. It is also the maximum length of input sequence. - lite::LoD batch_lods; - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); + LoD* batch_lods = batch->mutable_lod(); + batch_lods->resize(3); // batch_lods[0] is the start positions for batch LoDTensor int max_seqlen = seq_info[0].length; - batch_lods[0].resize(static_cast(max_seqlen + 1)); + batch_lods->at(0).resize(static_cast(max_seqlen + 1)); // batch_lods[1] is the raw index in the input LoDTensor - batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + batch_lods->at(1).resize(static_cast(lod_tensor.dims()[0])); // batch_lods[2] is the sort order for the input LoDTensor. 
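The `batch_starts` loop in sequence2batch.h above is easier to follow in isolation: sequences are sorted by length, longest first, and time step `n` of the batch holds one row from every sequence whose length exceeds `n`. A minimal sketch (simplified from the functor, using the lengths from the header's own `seq_info` comment):

```cpp
// Derive batch_lods[0] (batch_starts) from sequence lengths {4, 5, 1}.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  std::vector<int> lengths = {4, 5, 1};
  std::sort(lengths.begin(), lengths.end(), std::greater<int>());  // {5, 4, 1}
  const int max_seqlen = lengths[0];
  std::vector<size_t> batch_starts(max_seqlen + 1, 0);
  for (int n = 0; n < max_seqlen; ++n) {
    size_t batch_id = batch_starts[n];
    for (int len : lengths) {
      if (len > n) ++batch_id;  // This sequence contributes a row at step n.
    }
    batch_starts[n + 1] = batch_id;
  }
  for (size_t s : batch_starts) std::printf("%zu ", s);
  std::printf("\n");  // Prints: 0 3 5 7 9 10
}
```

The final entry is 10, matching the 5 + 4 + 1 total rows of the input LoDTensor, and the refactor above now writes these offsets straight into `batch->mutable_lod()` instead of building a temporary `lite::LoD` and copying it in with `set_lod`.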
- batch_lods[2].resize(seq_info.size()); + batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods[0].data(); - size_t* seq2batch_idx = batch_lods[1].data(); + size_t* batch_starts = batch_lods->at(0).data(); + size_t* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -153,14 +148,13 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods[2].data(); + size_t* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } - batch->set_lod(batch_lods); CopyMatrixRowsFunctor to_batch; - to_batch(context, lod_tensor, batch_lods[1], batch, true); + to_batch(context, lod_tensor, batch_lods->at(1), batch, true); } }; diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ae997a8680b9012435d80b4aa9f592a775e62e85..ec45377bc55154a4a36ebc5c3684ab7efeeef88e 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -99,7 +99,7 @@ class SoftmaxFunctor> { const int axis_dim, const lite::Tensor* X, lite::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; @@ -140,7 +140,7 @@ class SoftmaxFunctor> { const int axis_dim, const lite::Tensor* X, lite::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); const float* in_data = X->data(); float* out_data = Y->mutable_data(); const int kBatchDim = 0; diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h new file mode 100644 index 0000000000000000000000000000000000000000..0689ec4c234509cee6f10f8e0f7dd432edae5c4e --- /dev/null +++ b/lite/backends/x86/parallel.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#ifdef PADDLE_WITH_MKLML +#include +#include "lite/backends/x86/mklml.h" +#endif + +namespace paddle { +namespace lite { +namespace x86 { + +static void SetNumThreads(int num_threads) { +#ifdef PADDLE_WITH_MKLML + int real_num_threads = std::max(num_threads, 1); + x86::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(real_num_threads); +#endif +} + +static inline int64_t GetMaxThreads() { + int64_t num_threads = 1; +#ifdef PADDLE_WITH_MKLML + // Do not support nested omp parallem. + num_threads = omp_in_parallel() ? 
1 : omp_get_max_threads(); +#endif + return std::max(num_threads, 1L); +} + +using ThreadHandler = + std::function; + +static inline void RunParallelFor(const int64_t begin, + const int64_t end, + const ThreadHandler& f) { + if (begin >= end) { + return; + } + +#ifdef PADDLE_WITH_MKLML + int64_t num_threads = std::min(GetMaxThreads(), end - begin); + if (num_threads > 1) { +#pragma omp parallel num_threads(num_threads) + { + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = (end - begin + num_threads - 1) / num_threads; + int64_t begin_tid = begin + tid * chunk_size; + f(begin_tid, std::min(end, chunk_size + begin_tid)); + } + return; + } +#endif + + f(begin, end); +} + +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt index f911f8e0e7c61481e1d4e309bc0635718be11206..4491fdeaefe9f16265bdee2c07ebb02b86a2b038 100644 --- a/lite/backends/xpu/CMakeLists.txt +++ b/lite/backends/xpu/CMakeLists.txt @@ -2,5 +2,4 @@ if(NOT LITE_WITH_XPU) return() endif() -lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs}) -lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope) +lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc deleted file mode 100644 index 796eaf9c46ceb3d29f1ffdc4c86ac45509f07ba1..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/builder.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
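The new parallel.h ends here. A small usage sketch of `RunParallelFor` (header path as added by this diff; `ThreadHandler` receives the `[begin, end)` bounds of the chunk handed to each worker): each thread processes one contiguous chunk, and the call degrades to a plain serial invocation when MKLML/OpenMP is unavailable or only one thread is worthwhile.

```cpp
// Scale a vector in place using the RunParallelFor helper from this diff.
#include <cstdint>
#include <vector>

#include "lite/backends/x86/parallel.h"

void ScaleInPlace(std::vector<float>* data, float factor) {
  paddle::lite::x86::RunParallelFor(
      0,
      static_cast<int64_t>(data->size()),
      [&](int64_t begin, int64_t end) {
        // Each worker touches only its own [begin, end) chunk.
        for (int64_t i = begin; i < end; ++i) (*data)[i] *= factor;
      });
}
```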
- -#include "lite/backends/xpu/builder.h" -#include // NOLINT -#include -#include "lite/backends/xpu/runtime.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -xtcl::DataType CvtPrecisionType(PrecisionType in_type) { - xtcl::DataType out_type = ::xtcl::Float(32); - switch (in_type) { - case PRECISION(kFloat): - out_type = ::xtcl::Float(32); - break; - case PRECISION(kInt8): - out_type = ::xtcl::Int(8); - break; - case PRECISION(kInt32): - out_type = ::xtcl::Int(32); - break; - default: - LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -DLDataType CvtDataType(PrecisionType in_type) { - DLDataType out_type = {kDLFloat, 32, 1}; - switch (in_type) { - case PRECISION(kFloat): - out_type = {kDLFloat, 32, 1}; - break; - case PRECISION(kInt8): - out_type = {kDLInt, 8, 1}; - break; - case PRECISION(kInt32): - out_type = {kDLInt, 32, 1}; - break; - default: - LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - xtcl::Array out_shape; - for (auto dim : in_shape) { - out_shape.push_back(dim); - } - return out_shape; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - return CvtShape(std::vector(in_shape.begin(), in_shape.end())); -} - -xtcl::Array CvtShape(const DDim& in_dims) { - return CvtShape(in_dims.Vectorize()); -} - -std::shared_ptr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); - } - auto out_tensor = std::make_shared( - xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); - auto out_data = - reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); - std::memcpy(out_data, in_data, in_bytes); - return out_tensor; -} - -// Build the XPU subgraph to the XPU model, store the model data into the -// weight tensor of the graph op, and the model data will be 
loaded again -// by the graph computing kernel when the graph op is executed for inference. -// Due to the lack of XPU APIs for building and outputing the model data, -// the compiled XPU runtime object will be managed by the global variable -// 'DeviceInfo' and the key name for finding the runtime object will be -// stored in the weight tensor of graph op. -// TODO(hong19860320) Compile the XPU subgraph and output the compiled model -// data to the weight tensor of graph op. -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model) { - LOG(INFO) << "[XPU] Build Model."; - CHECK(builder != nullptr); - CHECK(outputs != nullptr); - CHECK_GT(outputs->size(), 0); - CHECK(model != nullptr); - - // build graph and fill all of constant params - xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); - auto target = xtcl::Target::Create("llvm"); - auto compiler = xtcl::network::xTensorCompiler(network, target); - compiler.SetParams(*params); // set the data of constant tensors - compiler.Build(); - - // create and register runtime - auto runtime = std::make_shared( - compiler.CreateRuntimeInstance()); - if (runtime == nullptr) { - LOG(WARNING) << "[XPU] Build Model failed!"; - return false; - } - std::string name = UniqueName("xpu"); - LOG(INFO) << "[XPU] Model Name: " << name; - DeviceInfo::Global().Insert(name, runtime); - model->Resize({static_cast(name.length() + 1)}); - memcpy(model->mutable_data(), - reinterpret_cast(name.c_str()), - name.length() + 1); - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..badde878ad870bfc5fcd1984e39923174a11e9e2 --- /dev/null +++ b/lite/backends/xpu/device.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
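device.cc above compiles the finalized network and hands back an owning runtime pointer, replacing the deleted name-registry dance in builder.cc. A hedged sketch of a call site follows; the element type of `outputs` is assumed to be `xtcl::xExpr*`, consistent with the dereference in `Device::Build`, since the template arguments were not preserved in this diff.

```cpp
// Sketch of a subgraph kernel driving the new XPU Device API. Populating the
// builder and params from the Lite subgraph is elided here.
#include <memory>
#include <vector>

#include "lite/backends/xpu/device.h"

std::unique_ptr<xtcl::network::xRuntimeInstance> CompileSubgraph(
    xtcl::network::xNetworkBuilder* builder,
    xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
    std::vector<xtcl::xExpr*>* outputs) {
  // Build() wraps the outputs in a TupleNode, so multi-output subgraphs no
  // longer need to be compiled one output at a time.
  return paddle::lite::xpu::Device::Global().Build(builder, params, outputs);
}
```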
+ +#include "lite/backends/xpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace xpu { + +std::unique_ptr Device::Build( + xtcl::network::xNetworkBuilder* builder, + xtcl::network::xTensorCompiler::ParamNDArrayMap* params, + std::vector* outputs) { + VLOG(3) << "[XPU] Build model"; + CHECK(builder != nullptr); + CHECK(outputs != nullptr); + CHECK_GT(outputs->size(), 0); + + // The XPU compiler build the graph and fill all of the constant params, and + // use TupleNode to support multiple outputs + xtcl::Array all_outs; + for (size_t i = 0; i < outputs->size(); i++) { + all_outs.push_back(*outputs->at(i)); + } + xtcl::xNetwork network = + builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs)); + auto target = xtcl::NullValue(); + if (!target_.empty()) { + target = xtcl::Target::Create(target_); + } + xtcl::network::xTensorCompiler compiler(network, target); + compiler.SetParams(*params); // Set the data of constant tensors + compiler.Build(); + VLOG(3) << "[XPU] Build done"; + return std::unique_ptr( + new xtcl::network::xRuntimeInstance(compiler.CreateRuntimeInstance())); +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..6de18d5466da6e6b791363d2e275ea72376c78b8 --- /dev/null +++ b/lite/backends/xpu/device.h @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace xpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() { + char* name = std::getenv("XPU_DEVICE_NAME"); + if (name) { + name_ = std::string(name); + } + // XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu + // -libs=xdnn' + char* target = std::getenv("XPU_DEVICE_TARGET"); + if (target) { + target_ = std::string(target); + } + } + + // Build the XPU graph to the XPU runtime, return the XPU runtime which can be + // used to run inference. + std::unique_ptr Build( + xtcl::network::xNetworkBuilder* builder, + xtcl::network::xTensorCompiler::ParamNDArrayMap* params, + std::vector* outputs); + + const std::string name() const { return name_; } + const std::string target() const { return target_; } + + private: + std::string name_{""}; + std::string target_{""}; +}; + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc deleted file mode 100644 index a2c34b95758e8abf81c8294507d0ca60aad7c021..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/runtime.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/runtime.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace xpu { - -// Extract the model data and recover the XPU model for inference, the function -// is called by the graph computing kernel when the graph op is executed. -// Due to the lack of XPU APIs for loading and recovering the XPU model from -// memory, the key name is obtained from the weight tensor of graph op, to get -// the runtime object for inference from the global variable 'DeviceInfo'. -// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. -bool LoadModel(const lite::Tensor &model, - std::shared_ptr *runtime) { - LOG(INFO) << "[XPU] Load Model."; - CHECK_GT(model.dims().production(), 0); - std::string name(reinterpret_cast(model.data())); - LOG(INFO) << "[XPU] Model Name: " << name; - CHECK(runtime != nullptr); - *runtime = DeviceInfo::Global().Find(name); - if (*runtime == nullptr) { - LOG(WARNING) << "[XPU] Load Model failed!"; - return false; - } - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h deleted file mode 100644 index 4ff8d75bce6156d51a4988d427058da34460443f..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/runtime.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
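// [Editor's note] A minimal usage sketch of the new API (illustrative only,
// not part of the patch; assumes builder, params and output exprs have been
// populated from the subgraph's ops elsewhere):
//
//   xtcl::network::xNetworkBuilder builder;
//   xtcl::network::xTensorCompiler::ParamNDArrayMap params;
//   std::vector<xtcl::xExpr*> outputs;
//   // ... translate the subgraph ops into builder/params/outputs ...
//   auto runtime =
//       paddle::lite::xpu::Device::Global().Build(&builder, &params, &outputs);
//   CHECK(runtime != nullptr);  // the instance is ready to run inference
//
// The target defaults to xtcl::NullValue<xtcl::Target>() and can be overridden
// through the XPU_DEVICE_TARGET environment variable, e.g. 'llvm'.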
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace xpu {
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static DeviceInfo x;
-    return x;
-  }
-  DeviceInfo() {}
-
-  void Insert(const std::string& name,
-              std::shared_ptr runtime) {
-    if (runtimes_.find(name) != runtimes_.end()) {
-      LOG(WARNING) << "[XPU] Model " << name << " already exists.";
-      return;
-    }
-    runtimes_.emplace(std::make_pair(name, runtime));
-  }
-
-  void Clear() { runtimes_.clear(); }
-
-  std::shared_ptr Find(
-      const std::string& name) const {
-    if (runtimes_.find(name) != runtimes_.end()) {
-      return runtimes_.at(name);
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  int device_id_{0};
-  std::string device_name_{"default"};
-  std::unordered_map>
-      runtimes_;
-};
-
-bool LoadModel(const lite::Tensor& model,
-               std::shared_ptr* runtime);
-
-}  // namespace xpu
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 5eecf1d815d30fe0ef10a55c6b6b351795fe63ae..1d0558451fce67433d966d1f4bff82af26459e33 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -6,7 +6,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
     X86_DEPS target_wrapper_x86
     CUDA_DEPS target_wrapper_cuda
     CL_DEPS cl_target_wrapper
-    FPGA_DEPS fpga_target_wrapper)
+    FPGA_DEPS fpga_target_wrapper
+    BM_DEPS target_wrapper_bm)
 
 lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
 
@@ -33,9 +34,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor)
 lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
 
 if (LITE_WITH_ARM)
-lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime)
+lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags)
 else()
-lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime)
+lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags)
 endif()
 
 #-------------------------------------------- GET CODE META INFO ------------------------------------------
@@ -95,11 +96,19 @@ add_custom_command(
 add_custom_target(op_list_h DEPENDS ops.h)
 add_custom_target(kernel_list_h DEPENDS kernels.h)
 add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
-
+# create a header file to record ops info sorted by supported platforms
+add_custom_command(
+  COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py
+          ${kernels_src_list}
+          ${ops_src_list}
+          ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h
+  OUTPUT supported_kernel_op_info.h # not a real path to the output, to force it to execute every time.
+  )
+add_custom_target(supported_kernel_op_info_h DEPENDS supported_kernel_op_info.h)
#----------------------------------------------- NOT CHANGE -----------------------------------------------
 lite_cc_library(kernel SRCS kernel.cc
   DEPS context type_system target_wrapper any op_params tensor
-  PROFILE_DEPS basic_profiler
+  PROFILE_DEPS lite_profiler
   )
 lite_cc_library(op SRCS op_lite.cc
   DEPS scope op_registry target_wrapper kernel
   cpp_op_desc tensor
@@ -113,7 +122,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)
 
 lite_cc_library(program SRCS program.cc
   DEPS op kernel model_parser ${ops} ${cpp_wrapper}
-  PROFILE_DEPS basic_profiler)
+  PROFILE_DEPS lite_profiler)
 
 if (NOT LITE_ON_TINY_PUBLISH)
   lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program)
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt
index bc77afd81e0859b9492b2068ce681098a9393923..0f3f36768bd5a079564002cbb6464d61bd5db3aa 100644
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -5,6 +5,6 @@ endif()
 
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc
index c59c078787b9a6778227ba6ba51230d1fc2104cb..fe36f1e1ba16ad85c44136b09a0d2e5d3fadf688 100644
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
@@ -14,13 +14,38 @@
 
 #include "lite/core/arena/framework.h"
 #include "lite/core/context.h"
+#include "lite/operators/subgraph_op.h"
 
 namespace paddle {
 namespace lite {
 namespace arena {
 
 void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
+  std::shared_ptr<OpLite> op = nullptr;
+  if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
+    // Create a new block desc to wrap the original op desc
+    int sub_block_idx = 0;
+    auto sub_block_desc = new cpp::BlockDesc();
+    sub_block_desc->ClearOps();
+    sub_block_desc->ClearVars();
+    auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
+    *sub_block_op_desc = *op_desc_;
+    // Add the block desc into the subgraph op which is used to replace the
+    // original op
+    op_desc_.reset(new cpp::OpDesc());
+    op_desc_->SetType("subgraph");
+    op_desc_->SetAttr("sub_block", sub_block_idx);
+    auto in_names = sub_block_op_desc->input_vars();
+    auto out_names = sub_block_op_desc->output_vars();
+    op_desc_->SetInput("Inputs", in_names);
+    op_desc_->SetOutput("Outputs", out_names);
+    op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
+    op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+    static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
+  } else {
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+  }
   CHECK(op) << "no op for " << op_desc().Type();
   op->Attach(*op_desc_, inst_scope_);
   auto kernels = op->CreateKernels({place_});
@@ -37,6 +62,9 @@ void TestCase::CreateInstruction() {
   // prepare context
   (*it)->SetContext(std::move(ctx_));
  instruction_.reset(new Instruction(op, std::move(*it)));
+#ifdef LITE_WITH_PROFILE
+  instruction_->set_profiler(new profile::Profiler());
+#endif
 }
 
 void TestCase::PrepareInputsForInstruction() {
@@ -65,6 +93,19 @@
   }
 }
 
+TestCase::~TestCase() {
+  if (op_desc_->Type() == "subgraph") {
+    // Release the sub-block desc of the subgraph op
+    auto subgraph_op = const_cast<operators::SubgraphOp*>(
+        static_cast<const operators::SubgraphOp*>(instruction_->op()));
+    CHECK(subgraph_op);
+    auto sub_block_desc = subgraph_op->GetSubBlock();
+    if (sub_block_desc) {
+      delete sub_block_desc;
+    }
+  }
+}
+
 }  // namespace arena
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h
index 412ac0c167b8abe6d196dc25d1bc5b193d02965d..85edda26e6591bada967165317de00b169a2d0cd 100644
--- a/lite/core/arena/framework.h
+++ b/lite/core/arena/framework.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 #include
 #include "lite/core/op_registry.h"
@@ -42,7 +43,7 @@ class TestCase {
       : place_(place), scope_(new Scope), alias_(alias) {
     ctx_ = ContextScheduler::Global().NewContext(place_.target);
   }
-  virtual ~TestCase() {}
+  virtual ~TestCase();
 
   void Prepare() {
     PrepareScopes();
@@ -77,6 +78,20 @@
   // kernel registry.
   void CheckKernelConsistWithDefinition() {}
 
+  // Get the real precision of the output for precision checking. When the
+  // declared precision obtained from the kernel is kAny, the precision of
+  // the output should be set in the test case.
+  bool GetPrecisionType(const std::string& var_name,
+                        PrecisionType* precision_type) {
+    auto res = precision_type_map_.find(var_name);
+    if (res == precision_type_map_.end()) {
+      return false;
+    } else {
+      *precision_type = precision_type_map_.at(var_name);
+      return true;
+    }
+  }
+
   Scope& scope() { return *scope_; }
 
   Scope* baseline_scope() { return base_scope_; }
@@ -92,7 +107,8 @@
   void SetCommonTensor(const std::string& var_name,
                        const DDim& ddim,
                        const T* data,
-                       const LoD& lod = {}) {
+                       const LoD& lod = {},
+                       bool is_persistable = false) {
     auto* tensor = scope_->NewTensor(var_name);
     tensor->Resize(ddim);
     auto* d = tensor->mutable_data<T>();
@@ -100,11 +116,26 @@
 
     // set lod
     if (!lod.empty()) *tensor->mutable_lod() = lod;
+    // set persistable
+    tensor->set_persistable(is_persistable);
   }
 
   // Prepare for the operator.
   virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0;
 
+  // Set the real precision of the output for precision checking. When the
+  // declared precision obtained from the kernel is kAny, the precision of
+  // the output should be set in the test case.
+  void SetPrecisionType(const std::string& var_name,
+                        const PrecisionType& precision_type) {
+    auto res = precision_type_map_.find(var_name);
+    if (res == precision_type_map_.end()) {
+      precision_type_map_.insert({var_name, precision_type});
+    } else {
+      precision_type_map_.at(var_name) = precision_type;
+    }
+  }
+
  public:
   const Instruction& instruction() { return *instruction_; }
 
@@ -148,6 +179,7 @@
   Scope* base_scope_{};
   std::unique_ptr op_desc_;
   std::unique_ptr instruction_;
+  std::unordered_map<std::string, PrecisionType> precision_type_map_;
 };
 
 class Arena {
@@ -159,13 +191,17 @@
     tester_->Prepare();
   }
 
-  bool TestPrecision() {
+  bool TestPrecision(const std::vector<std::string>& exclude_outs = {}) {
     tester_->RunBaseline(tester_->baseline_scope());
     tester_->RunInstruction();
 
     bool success = true;
     for (auto& out : tester_->op_desc().OutputArgumentNames()) {
       for (auto& var : tester_->op_desc().Output(out)) {
+        if (std::find(exclude_outs.begin(), exclude_outs.end(), var) !=
+            exclude_outs.end()) {
+          continue;
+        }
         success = success && CompareTensor(out, var);
       }
     }
@@ -180,7 +216,17 @@
     }
     auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::high_resolution_clock::now() - timer);
-    LOG(INFO) << "average duration: " << duration.count() << " ms";
+
+    timer = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < times; i++) {
+      tester_->RunBaseline(tester_->baseline_scope());
+    }
+    auto duration_basic = std::chrono::duration_cast<std::chrono::milliseconds>(
+        std::chrono::high_resolution_clock::now() - timer);
+    LOG(INFO) << "average lite duration: " << duration.count() << " ms";
+    LOG(INFO) << "average basic duration: " << duration_basic.count() << " ms";
+    LOG(INFO) << "speed up ratio: lite_speed / basic_speed: "
+              << static_cast<float>(duration_basic.count()) / duration.count();
   }
 
  private:
@@ -189,8 +235,11 @@
     // get tensor type.
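// [Editor's note] Illustrative use of the precision map above (hypothetical
// test code, not part of the patch): a kernel that declares its output
// precision as kAny cannot be checked directly, so the test case registers
// the real precision up front and CompareTensor() below falls back to it.
//
//   // in a test case whose kernel output is declared kAny:
//   SetPrecisionType(out_var_name, PRECISION(kFloat));
//   // CompareTensor() then resolves kAny -> kFloat via GetPrecisionType().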
    const Type* type =
        tester_->instruction().kernel()->GetOutputDeclType(arg_name);
-
-    switch (type->precision()) {
+    auto precision_type = type->precision();
+    if (precision_type == PRECISION(kAny)) {
+      CHECK(tester_->GetPrecisionType(var_name, &precision_type));
+    }
+    switch (precision_type) {
      case PRECISION(kFloat):
        return tester_->CheckPrecision<float>(var_name, abs_error_);
      case PRECISION(kInt8):
@@ -199,7 +248,6 @@
        return tester_->CheckPrecision<int32_t>(var_name, abs_error_);
      case PRECISION(kBool):
        return tester_->CheckPrecision<bool>(var_name, abs_error_);
-
      default:
        LOG(FATAL) << "not support type " << PrecisionToStr(type->precision());
        return false;
diff --git a/lite/core/context.h b/lite/core/context.h
index 545c6d2e8804f72a0bde854f9e5ae82c80b2b53c..653329e4f24b1f391ea41ed39819b60c8a598a3b 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -25,12 +25,6 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
-#ifdef LITE_WITH_NPU
-#include "lite/backends/npu/runtime.h"
-#endif
-#ifdef LITE_WITH_XPU
-#include "lite/backends/xpu/runtime.h"
-#endif
 
 #include
 #include
@@ -61,6 +55,7 @@
 using NPUContext = Context<TargetType::kNPU>;
 using XPUContext = Context<TargetType::kXPU>;
 using OpenCLContext = Context<TargetType::kOpenCL>;
 using FPGAContext = Context<TargetType::kFPGA>;
+using BMContext = Context<TargetType::kBM>;
 
 template <>
 class Context {
@@ -88,12 +83,29 @@
 };
 #endif
 
+#ifdef LITE_WITH_BM
+template <>
+class Context<TargetType::kBM> {
+ public:
+  Context() {}
+  explicit Context(const BMContext& ctx);
+  // NOTE: InitOnce should only be used by ContextScheduler
+  void InitOnce() { Init(0); }
+
+  void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
+  void CopySharedTo(BMContext* ctx) {}
+  void* GetHandle() { return TargetWrapperBM::GetHandle(); }
+
+  std::string name() const { return "BMContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_XPU
 template <>
 class Context<TargetType::kXPU> {
  public:
   Context() {}
-  explicit Context(const NPUContext& ctx);
+  explicit Context(const XPUContext& ctx);
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
   void CopySharedTo(XPUContext* ctx) {}
@@ -207,13 +219,6 @@ class Context {
     ctx->cublas_fp32_ = cublas_fp32_;
   }
 
-  CUDAContext& operator=(const CUDAContext& context) {
-    this->Init(
-        context.device_id_, context.exec_stream_id_, context.io_stream_id_);
-    this->cublas_fp32_ = context.cublas_fp32_;
-    return *this;
-  }
-
   const cudaStream_t& exec_stream() const { return exec_stream_; }
   void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
 
@@ -387,6 +392,12 @@ class ContextScheduler {
         kernel_contexts_[TargetType::kFPGA].As().CopySharedTo(
             &ctx->As());
         break;
+#endif
+#ifdef LITE_WITH_BM
+      case TARGET(kBM):
+        kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
+            &ctx->As<BMContext>());
+        break;
 #endif
       default:
 #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
@@ -425,6 +436,9 @@
 #endif
 #ifdef LITE_WITH_XPU
     InitContext<TargetType::kXPU, XPUContext>();
+#endif
+#ifdef LITE_WITH_BM
+    InitContext<TargetType::kBM, BMContext>();
 #endif
   }
 
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index f5b757ac3ccd6310f6a6fd9fe6483d28ff7adbc6..6e0d743fb9d8d8af5e7168e292c1e85d76844383 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -79,7 +79,7 @@ const int DEFAULT_L3_CACHE_SIZE = 0;
 int get_cpu_num() {
 #ifdef LITE_WITH_LINUX
   // get cpu count from /sys/devices/system/cpu/cpunum/uevent
-  int max_cpu_num = 20;
+  int max_cpu_num = 128;
   int cpu_num = 0;
   for (int i = 0; i < max_cpu_num; ++i) {
     char path[256];
@@ -227,19 +227,24 @@ void get_cpu_arch(std::vector* archs, const int cpu_num) {
 #ifdef LITE_WITH_LINUX
std::string get_cpu_name() { - std::string cpu_name; + std::string cpu_name = ""; FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) { return ""; } char line[1024]; + bool first_model_name = true; while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) { break; } if (strstr(line, "Hardware") != NULL) { - cpu_name = std::string(line); + cpu_name += std::string(line); + } + if (strstr(line, "model name") != NULL && first_model_name) { + cpu_name += std::string(line); + first_model_name = false; } } #ifdef LITE_WITH_ANDROID @@ -816,6 +821,21 @@ bool DeviceInfo::SetCPUInfoByName() { SetFP16Info(1, 1); SetDotInfo(1, 1); return true; + } else if (dev_name_.find("FT2000PLUS") != std::string::npos) { + core_num_ = 64; + core_ids_.resize(core_num_); + big_core_ids_.resize(core_num_); + cluster_ids_.resize(core_num_); + for (int i = 0; i < core_num_; ++i) { + core_ids_[i] = i; + big_core_ids_[i] = i; + cluster_ids_[i] = 0; + } + little_core_ids_ = {}; + SetCacheInfo(0, 1, 64 * 1024); + SetCacheInfo(1, 1, 32 * 1024 * 1024); + SetCacheInfo(2, 1, 128 * 1024 * 1024); + return true; } return false; } diff --git a/lite/core/framework.proto b/lite/core/framework.proto index 5adf2a18b98c2a2d3e2f6e8f7dd5688150674dc6..84b5502ff7b369452e7c9988d185450934c78b03 100644 --- a/lite/core/framework.proto +++ b/lite/core/framework.proto @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 05d7a6b333810a8dc988d84a281f096babe8929f..18a1243c11652afc181f13f0f5a497858a30885f 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -31,7 +31,7 @@ #include "lite/utils/replace_stl/stream.h" #ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" +#include "lite/core/profile/profiler.h" #endif // LITE_WITH_PROFILE namespace paddle { @@ -58,7 +58,10 @@ class KernelBase { virtual void Run() = 0; #ifdef LITE_WITH_PROFILE - void SetProfileID(uint32_t id) { profile_id_ = id; } + void SetProfiler(profile::Profiler* profiler, int id) { + profiler_ = profiler; + profile_id_ = id; + } #endif void Launch() { @@ -80,12 +83,11 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif - #ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "kernel"); - Run(); - } + profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); + profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); + Run(); + profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); #else Run(); #endif @@ -175,6 +177,7 @@ class KernelBase { bool is_first_epoch_{true}; #ifdef LITE_WITH_PROFILE + profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; #endif }; diff --git a/lite/core/lite.map b/lite/core/lite.map index 31adae42196c3d6b82628a2e433b13a4cb467b39..9cfd272eb6d3017a75b40481d25527d7c14478bf 100644 --- a/lite/core/lite.map +++ b/lite/core/lite.map @@ -1,6 +1,8 @@ { global: *paddle*; + *touch_*; + *mir_pass_*; local: *; }; diff --git a/lite/core/memory.cc b/lite/core/memory.cc index b3cb18b33630de6615812471e1acaab59c8e99b0..cfb0b3ae1765864200ecf2d70107a3aa0046899c 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -40,6 +40,11 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif 
// LITE_WITH_OPENCL +#ifdef LITE_WITH_BM + case TargetType::kBM: + data = TargetWrapper::Malloc(size); + break; +#endif default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } @@ -69,6 +74,11 @@ void TargetFree(TargetType target, void* data) { TargetWrapper::Free(data); break; #endif // LITE_WITH_CUDA +#ifdef LITE_WITH_BM + case TargetType::kBM: + TargetWrapper::Free(data); + break; +#endif default: LOG(FATAL) << "Unknown type"; } @@ -95,6 +105,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::DtoD); break; #endif +#ifdef LITE_WITH_BM + case TargetType::kBM: + TargetWrapper::MemcpySync(dst, src, size, IoDirection::DtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index cb4ac044e7af6994e5e404f379eeb12290e34778..051d47bdde102f5fe058163d0c746fe3c4acf26e 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -25,6 +25,10 @@ #include "lite/backends/cuda/target_wrapper.h" #endif // LITE_WITH_CUDA +#ifdef LITE_WITH_BM +#include "lite/backends/bm/target_wrapper.h" +#endif // LITE_WITH_BM + namespace paddle { namespace lite { @@ -71,6 +75,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kFPGA): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_BM + case TARGET(kBM): + TargetWrapper::MemcpySync(dst, src, size, dir); + break; #endif } } @@ -100,13 +109,14 @@ class Buffer { template void ResetLazyImage2D(TargetType target, const size_t img_w, - const size_t img_h) { + const size_t img_h, + void* host_ptr = nullptr) { size_t size = sizeof(T) * img_w * img_h * 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || cl_image2d_height_ < img_h) { Free(); - data_ = TargetWrapperCL::MallocImage(img_w, img_h); + data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; space_ = size; // un-used for opencl Image2D cl_image2d_width_ = img_w; @@ -119,6 +129,7 @@ class Buffer { if (space_ > 0) { TargetFree(target_, data_); } + data_ = nullptr; target_ = TargetType::kHost; space_ = 0; } diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index a44b8348716449519486d37f6784e31ecc39f554..379ef67f2996519d0c8007d8f191efbd2166a9e3 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -16,10 +16,13 @@ lite_cc_library(mir_passes fusion/interpolate_fuse_pass.cc fusion/conv_elementwise_fuse_pass.cc fusion/conv_activation_fuse_pass.cc + fusion/var_conv_2d_activation_fuse_pass.cc fusion/conv_bn_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc + fusion/sequence_pool_concat_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc + elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc type_target_cast_pass.cc @@ -32,7 +35,8 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc - DEPS mir_pass types context ${mir_fusers} ${subgraph_passes}) + weight_quantization_preprocess_pass.cc + DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS #mir_ssa_graph scope op diff --git a/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc 
new file mode 100644
index 0000000000000000000000000000000000000000..863c01ef0646794b5cbe54d7a81a8f26dbf164ae
--- /dev/null
+++ b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/pass.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+namespace {
+
+class ElementwiseMulConstantEliminator : public FuseBase {
+ public:
+  void BuildPattern() override {
+    auto* pre_op = OpNode("preop");    // the previous op's output needs to be updated
+    auto* post_op = OpNode("postop");  // the post op's input needs to be updated
+    // TODO(Superjomn) check has only one output
+    auto* x =
+        VarNode("x")->assert_is_op_input("elementwise_mul", "X")->AsOutput();
+    auto* y = VarNode("Y")->assert_is_op_input("elementwise_mul", "Y");
+
+    // create op nodes
+    auto* mul = OpNode("mul", "elementwise_mul")
+                    ->assert_is_op("elementwise_mul")
+                    ->AsIntermediate();
+
+    auto* fill_constant = OpNode("fill_constant", "fill_constant")
+                              ->assert_is_op("fill_constant")
+                              ->assert_op_attr("value", 1.)
+                              ->AsIntermediate();
+    // create output node
+    auto* mul_out =
+        VarNode("output")->assert_is_op_output("elementwise_mul", "Out");
+    // create topology.
+    std::vector<PMNode*> add_inputs{x, y};
+    *pre_op >> *x;
+    *fill_constant >> *y;
+    add_inputs >> *mul >> *mul_out;
+    *mul_out >> *post_op;
+
+    // The elementwise_mul (and its fill_constant input) will be eliminated,
+    // and the post op will be updated to read x directly.
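// [Editor's note, illustrative] Net effect of the pattern built above:
//
//   y = fill_constant(value=1.0)
//   out = elementwise_mul(x, y)      =>   post_op(x, ...)
//   post_op(out, ...)
//
// Multiplying by a constant 1.0 is a no-op, so fill_constant and
// elementwise_mul are removed and post_op is rewired to read x directly;
// the nodes marked intermediate below are the ones deleted from the graph.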
+    mul_out->AsIntermediate();  // the elementwise_mul output will be removed
+    y->AsIntermediate();        // the fill_constant output will be removed
+  }
+
+ private:
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto& post_op = matched.at("postop")->AsStmt();
+    auto op_info = *post_op.op_info();
+
+    op_info.UpdateAllInputs(matched.at("output")->AsArg().name,
+                            matched.at("x")->AsArg().name);
+    post_op.ResetOp(op_info, graph->valid_places());
+
+    IR_NODE_LINK_TO(matched.at("x"), matched.at("postop"));
+  }
+};
+
+}  // namespace
+
+class ElementwiseMulConstantEliminatePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    ElementwiseMulConstantEliminator eliminator;
+    eliminator(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(elementwise_mul_constant_eliminate_pass,
+                  paddle::lite::mir::ElementwiseMulConstantEliminatePass)
+    .BindTargets({TARGET(kAny)});
diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
index acea48c742522d5b6b5f1f3b570fcbfe0c4be08d..345361047bbbad68cdd0b298a163214cbfe114fc 100644
--- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
+++ b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
@@ -25,7 +25,8 @@ namespace {
 class Eliminator : public FuseBase {
  public:
   void BuildPattern() override {
-    auto* pre_op = OpNode("preop");  // the previous op's output need update
+    // the previous op's output needs to be updated
+    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
     // TODO(Superjomn) check has only one output
     auto* x = VarNode("x")->assert_is_op_input("scale", "X");
     auto* scale_op = OpNode("scale", "scale")
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
index 5ac52837551f0b78d67dfe1733fe354ee2cf7f01..e65e72cf7b367ee8477f3f783ae4d82372529864 100644
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise
 lite_cc_library(fuse_conv_activation
     SRCS conv_activation_fuser.cc
     DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_var_conv_activation
+    SRCS var_conv_2d_activation_fuser.cc
+    DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_conv_bn
     SRCS conv_bn_fuser.cc
     DEPS pattern_matcher_high_api)
@@ -25,17 +28,22 @@ lite_cc_library(fuse_transpose_softmax_transpose
 lite_cc_library(fuse_interpolate
     SRCS interpolate_fuser.cc
     DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_sequence_pool_concat
+    SRCS sequence_pool_concat_fuser.cc
+    DEPS pattern_matcher_high_api)
 
 set(mir_fusers
     fuse_fc
     fuse_shuffle_channel
     fuse_conv_elementwise
     fuse_conv_activation
+    fuse_var_conv_activation
     fuse_conv_bn
     fuse_quant_dequant
     fuse_elementwise_add_activation
     fuse_transpose_softmax_transpose
     fuse_interpolate
+    fuse_sequence_pool_concat
     CACHE INTERNAL "fusers")
 
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
index ff064fb2ee93fc540e932da36fb07bb78eef989a..b688bbc1083a6ab0f521381c4a988a12badc3141 100644
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
@@ -29,8 +29,13 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) {
       act_types.push_back("leaky_relu");
       break;
     }
+    if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
+      act_types.push_back("relu6");
+
act_types.push_back("leaky_relu"); + break; + } } - for (auto conv_type : {"conv2d", "depthwise_conv2d"}) { + for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { for (auto has_bias : {true, false}) { fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias); @@ -47,4 +52,5 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc index 6ba11a6a4e82416eb386ec3b34c71183cef5adcb..993fe4e9441824d0c5539e6555e5e12d87e5b98f 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.cc +++ b/lite/core/mir/fusion/conv_activation_fuser.cc @@ -79,6 +79,9 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr("act_type", act_type_); if (act_type_ == "relu") { op_desc.SetAttr("fuse_relu", true); + } else if (act_type_ == "relu6") { + float alpha = act_op_desc.GetAttr("threshold"); + op_desc.SetAttr("fuse_brelu_threshold", alpha); } else if (act_type_ == "leaky_relu") { float alpha = act_op_desc.GetAttr("alpha"); op_desc.SetAttr("leaky_relu_alpha", alpha); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index d9d9c1bbf55bd33c31aa9a22de934d4eae8657c6..f5a7837b53650e08f9632b499a4c2ab1faeaeedf 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -27,7 +27,6 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { // initialze fuser params std::vector conv_has_bias_cases{true, false}; std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; - // start fuse using params for (auto conv_has_bias : conv_has_bias_cases) { for (auto conv_type : conv_type_cases) { @@ -45,4 +44,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kX86)}); + .ExcludeTargets({TARGET(kX86), TARGET(kXPU), TARGET(kBM)}); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index ec07278eed1f259c45e225497f94d682b544c57c..0f5bb64e10dd61c3edf4ddd32569a2d365651cdf 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -100,14 +100,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { auto eps = matched.at("bn")->stmt()->op_info()->GetAttr("epsilon"); // conv - auto conv_weight_t = scope->FindVar(matched.at("conv_weight")->arg()->name) - ->GetMutable(); + std::string conv_weight_name = matched.at("conv_weight")->arg()->name; + auto conv_weight_t = + scope->FindVar(conv_weight_name)->GetMutable(); CHECK_EQ(static_cast(bn_scale_t->data_size()), static_cast(conv_weight_t->dims()[0])) << "The BN bias's size should be equal to the size of the first " << "dim size of the conv weights"; size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; + bool is_weight_quantization = + conv_op_desc->HasAttr("quantize_weight_bits") ? 
true : false; // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -160,6 +163,16 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } } conv_op_desc->SetAttr("weight_scale", weight_scale); + } else if (is_weight_quantization) { + std::string scale_name = conv_weight_name + "_quant_scale"; + if (conv_op_desc->HasAttr(scale_name)) { + auto scale = conv_op_desc->GetAttr>(scale_name); + CHECK_EQ(scale.size(), alpha_tensor.numel()); + for (size_t i = 0; i < scale.size(); i++) { + scale[i] *= alpha_data[i]; + } + conv_op_desc->SetAttr(scale_name, scale); + } } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc index fd9aadc5d01c2cb3b6c7a3e888503072a0798725..2021bdd3482663b823dd6c1dabdb11be5b5617e2 100644 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc @@ -46,4 +46,5 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, paddle::lite::mir::ConvElementwiseFusePass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index af66f5ab66bd09907cb9d28f00f17d983e54c252..1c2297710b7cf41dc1adb7cde30d9fcfb61c79f0 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,4 +35,7 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kX86)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index ed10f06f5651f4000485279d682689101d80aa5a..46695be396596c2ce9b74bb771326171fc7b374b 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -23,8 +23,13 @@ namespace lite { namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; +#ifdef LITE_WITH_X86 + fusion::FcFuser fuser(true); fuser(graph.get()); +#endif + + fusion::FcFuser fuser2(false); + fuser2(graph.get()); } } // namespace mir @@ -33,4 +38,7 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc index f7aa4bb5adcb848531ecc3a8f63bace1c2e3e0ff..54260732c5efe788f0d3740197253fa2321a7d02 100644 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ b/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -88,6 +88,7 @@ USE_LITE_OP(mul); USE_LITE_OP(elementwise_add); USE_LITE_OP(elementwise_sub); USE_LITE_OP(fc); +USE_LITE_OP(relu); USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 460c0fdf7a4309638b9852a315ca0efda02801ab..3c99131083d37ea2c8511ed136bff17c891529af 100644 
--- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -35,12 +35,23 @@ void FcFuser::BuildPattern() { std::vector mul_inputs{W, x}; std::vector add_inputs{mul_out, b}; mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; // Some op specialities. mul_out->AsIntermediate(); mul->AsIntermediate(); add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } } void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { @@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr( "in_num_col_dims", matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } return op_desc; } diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h index 7ba07527898c7e648c5f7f9151642ab0928fa496..6cb08f41574b67df1c78fa296d2d395771a66ee1 100644 --- a/lite/core/mir/fusion/fc_fuser.h +++ b/lite/core/mir/fusion/fc_fuser.h @@ -25,11 +25,13 @@ namespace fusion { class FcFuser : public FuseBase { public: + explicit FcFuser(bool with_relu) : with_relu_(with_relu) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + bool with_relu_; }; } // namespace fusion diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f823f45dc66f8ef6cc67cbb9b0d9860c86ec9340..da611e4490f4ba7268d9011b3dbb391a63a88305 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -396,6 +396,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetAttr("input_scale", scale_value); op_desc->SetInput("X", {input_act_node->arg()->name}); IR_NODE_LINK_TO(input_act_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_node, @@ -440,6 +442,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetInput("Y", {input_act_right_node->arg()->name}); IR_NODE_LINK_TO(input_act_left_node, quantized_node) IR_NODE_LINK_TO(input_act_right_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_left_node, diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c3b44ca12a1a9ad76720a9363533b9a20dd0999 --- /dev/null +++ b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h"
+#include
+#include
+#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void SequencePoolConcatFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  fusion::SequencePoolConcatFuser fuser;
+  fuser(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_sequence_pool_concat_fuse_pass,
+                  paddle::lite::mir::SequencePoolConcatFusePass)
+    .BindTargets({TARGET(kCUDA)});
diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..38f931f502430211c1e51de5e9f81af9e43462c8
--- /dev/null
+++ b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class SequencePoolConcatFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.cc b/lite/core/mir/fusion/sequence_pool_concat_fuser.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1c22aee86505a0d8e3f32b263cbbd9521504e6a
--- /dev/null
+++ b/lite/core/mir/fusion/sequence_pool_concat_fuser.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
+#include
+#include
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+// Merge {sequence_pool x 7, concat} into a single sequence_pool_concat op:
+//
+//   src1  src2 ... src7             src1  src2 ... src7
+//    |     |        |                 \     |      /
+//    v     v        v        =>        v    v     v
+//   sequence_pool x 7             sequence_pool_concat
+//    \     |        /                       |
+//     v    v       v                        v
+//        concat                            out
+void SequencePoolConcatFuser::BuildPattern() {
+  // create nodes.
+  auto* concat = OpNode("concat", "concat")->AsIntermediate();
+
+#define STR1(R) #R
+#define STR2(R) STR1(R)
+
+#define POOL_CONCAT_PATTERN(num)                                              \
+  auto* x_##num = VarNode(STR2(sequence_pool_x_##num))                        \
+                      ->assert_is_op_input("sequence_pool", "X")              \
+                      ->AsInput();                                            \
+  auto* sequence_pool_##num =                                                 \
+      OpNode(STR2(sequence_pool_##num), "sequence_pool")->AsIntermediate();   \
+  auto* sequence_pool_##num##_out =                                           \
+      VarNode(STR2(sequence_pool_##num##_out))                                \
+          ->assert_is_op_output("sequence_pool", "Out")                       \
+          ->assert_is_op_nth_input("concat", "X", num - 1)                    \
+          ->AsIntermediate();                                                 \
+  auto* sequence_pool_##num##_idx =                                           \
+      VarNode(STR2(sequence_pool_##num##_idx))                                \
+          ->assert_is_op_output("sequence_pool", "MaxIndex")                  \
+          ->AsIntermediate();                                                 \
+  *sequence_pool_##num >> *sequence_pool_##num##_idx;                         \
+  *x_##num >> *sequence_pool_##num >> *sequence_pool_##num##_out >> *concat;
+
+  auto* concat_out =
+      VarNode("concat_out")->assert_is_op_output("concat", "Out");
+  *concat >> *concat_out;
+
+  POOL_CONCAT_PATTERN(1);
+  POOL_CONCAT_PATTERN(2);
+  POOL_CONCAT_PATTERN(3);
+  POOL_CONCAT_PATTERN(4);
+  POOL_CONCAT_PATTERN(5);
+  POOL_CONCAT_PATTERN(6);
+  POOL_CONCAT_PATTERN(7);
+
+#undef POOL_CONCAT_PATTERN
+#undef STR1
+#undef STR2
+}
+
+void SequencePoolConcatFuser::InsertNewNode(SSAGraph* graph,
+                                            const key2nodes_t& matched) {
+  auto op_desc = GenOpDesc(matched);
+  auto sequence_pool_concat_op =
+      LiteOpRegistry::Global().Create("sequence_pool_concat");
+
+  auto concat = matched.at("concat")->stmt()->op();
+  auto* scope = concat->scope();
+  auto& valid_places = concat->valid_places();
+  sequence_pool_concat_op->Attach(op_desc, scope);
+
+  auto* new_op_node =
+      graph->GraphCreateInstructNode(sequence_pool_concat_op, valid_places);
+
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_1"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_2"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_3"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_4"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_5"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_6"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("sequence_pool_x_7"), new_op_node);
+  IR_NODE_LINK_TO(new_op_node, matched.at("concat_out"));
+}
+
+cpp::OpDesc SequencePoolConcatFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc = *matched.at("concat")->stmt()->op_info();
+  op_desc.SetType("sequence_pool_concat");
+  op_desc.SetInput("X",
+                   {matched.at("sequence_pool_x_1")->arg()->name,
+                    matched.at("sequence_pool_x_2")->arg()->name,
+                    matched.at("sequence_pool_x_3")->arg()->name,
+                    matched.at("sequence_pool_x_4")->arg()->name,
+                    matched.at("sequence_pool_x_5")->arg()->name,
+                    matched.at("sequence_pool_x_6")->arg()->name,
+                    matched.at("sequence_pool_x_7")->arg()->name});
+
+  std::vector<std::string> pooltypes;
+  pooltypes.push_back(matched.at("sequence_pool_1")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_2")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_3")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_4")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_5")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_6")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  pooltypes.push_back(matched.at("sequence_pool_7")
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<std::string>("pooltype"));
+  op_desc.SetAttr("pooltype", pooltypes);
+
+  op_desc.SetOutput("Out", {matched.at("concat_out")->arg()->name});
+
+  return op_desc;
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.h b/lite/core/mir/fusion/sequence_pool_concat_fuser.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8f731becd4a19554ddc347db7cca4bb6fd66ee9
--- /dev/null
+++ b/lite/core/mir/fusion/sequence_pool_concat_fuser.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+class SequencePoolConcatFuser : public FuseBase {
+ public:
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+};
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce2248cbc23d8887a22f94c14b2507fb0cacbed
--- /dev/null
+++ b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void VarConv2dActivationFusePass::Apply( + const std::unique_ptr& graph) { + std::vector act_types{"relu"}; + for (auto act_type : act_types) { + fusion::VarConvActivationFuser fuser(act_type, "var_conv_2d"); + fuser(graph.get()); + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_var_conv_2d_activation_fuse_pass, + paddle::lite::mir::VarConv2dActivationFusePass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..7616aadef340d3e4d6bc11534dd839c91fe9ed1d --- /dev/null +++ b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class VarConv2dActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..eabd97ae4513b84c9c002aa1587d45cce6b22e21 --- /dev/null +++ b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h" +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void VarConvActivationFuser::BuildPattern() { + // create nodes. 
+ auto* input = VarNode("X")->assert_is_op_input(conv_type_, "X")->AsInput(); + auto* filter = VarNode("W")->assert_is_op_input(conv_type_, "W")->AsInput(); + + auto* conv2d = OpNode("var_conv_2d", conv_type_)->AsIntermediate(); + + auto* act = OpNode("act", act_type_)->AsIntermediate(); + + auto* conv2d_out = VarNode("conv2d_out") + ->assert_is_op_output(conv_type_, "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* conv2d_out_1 = VarNode("conv2d_out_1") + ->assert_is_op_output(conv_type_, "Col") + ->AsIntermediate(); + + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); + + // create topology. + std::vector conv2d_inputs{filter, input}; + conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out; + *conv2d >> *conv2d_out_1; +} + +void VarConvActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto conv_op = LiteOpRegistry::Global().Create(conv_type_); + auto conv_old = matched.at("var_conv_2d")->stmt()->op(); + auto* scope = conv_old->scope(); + auto& valid_places = conv_old->valid_places(); + conv_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places); + + IR_NODE_LINK_TO(matched.at("X"), new_op_node); + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc VarConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { + cpp::OpDesc op_desc = *matched.at("var_conv_2d")->stmt()->op_info(); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info(); + + if (act_type_ == "relu") { + op_desc.SetAttr("fuse_relu", true); + } + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.h b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..68bc89f7d13d38dc07814f3296a25bfd7dea0248 --- /dev/null +++ b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
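// [Editor's note] The fuser declared below mirrors ConvActivationFuser: for a
// matched var_conv_2d + relu pair, GenOpDesc() above sets the 'fuse_relu'
// attribute on the var_conv_2d op desc and rewires its "Out" to the
// activation's output, so the fused kernel is expected to apply ReLU itself.
// The "Col" output of var_conv_2d is marked intermediate and dropped from the
// fused graph.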
+ +#pragma once + +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class VarConvActivationFuser : public FuseBase { + public: + explicit VarConvActivationFuser(const std::string& act_type, + const std::string& conv_type) + : act_type_(act_type), conv_type_(conv_type) {} + + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string act_type_; + std::string conv_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 76ea9555c29a245aa9f20b158f0706557940bef8..3a27360f94d7d828e1c19214d621f1dfe4e048ca 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -36,15 +36,6 @@ std::string Visualize(mir::SSAGraph* graph) { int id = 0; std::set exists_args; - std::map graph_col; // Different colors of subgraphs - graph_col.insert({{1, "red"}, - {2, "green"}, - {3, "cyan"}, - {4, "bisque3"}, - {5, "coral"}, - {6, "darkseagreen1"}, - {7, "goldenrod1"}, - {8, "darkorchid"}}); for (auto& node : graph->mutable_nodes()) { std::string key; if (node.IsArg()) { @@ -52,24 +43,12 @@ std::string Visualize(mir::SSAGraph* graph) { } else { key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); } - if (node.IsStmt()) { - auto& stmt = node.AsStmt(); - auto sub_id = stmt.subgraph_id(); - auto it = graph_col.find(sub_id); - if (sub_id > 0 && it != graph_col.end()) { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", it->second)}); - } else { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", "yellow")}); - } + dot.AddNode(key, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", "yellow")}); for (auto& x : node.inlinks) { auto name = x->AsArg().name; if (!exists_args.count(name)) { diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 1f2355e8a3205cce3410bd2cb6ac4a17d8fde602..6256a49a99b9097664c192d40502daf506437a31 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -50,7 +50,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( "lod_reset", "concat", "yolo_box", - "graph_op", + "subgraph", "feed", "fetch"}; for (auto* tmp : node->inlinks) { @@ -255,4 +255,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}); + .BindTargets({TARGET(kARM)}) + .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); diff --git a/lite/core/mir/node.cc b/lite/core/mir/node.cc index 4a90e530a46c4d42d2ba032da1828973dfc1bcef..52fd39182a7132777231929d49c319bb961cf7f9 100644 --- a/lite/core/mir/node.cc +++ b/lite/core/mir/node.cc @@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc, } valid_kernels_ = op_->CreateKernels(valid_places); } +void mir::Node::Stmt::ResetKernels(const std::vector &valid_places) { + CHECK(op_) << "change valid place failed, not created op"; + valid_kernels_.clear(); + 
+  valid_kernels_ = op_->CreateKernels(valid_places);
+}
 
 mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
   auto &x = AsArg();
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
index 60fa1fb1ebe49e1be38a7d84cb82545389ea4aac..e7c44d2be689a9d890158c097e198314413d1ba3 100644
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
@@ -53,6 +53,7 @@ class Node {
                  const std::vector<Place>& valid_places,
                  lite::Scope* scope = nullptr);
 
+    void ResetKernels(const std::vector<Place>& valid_places);
     std::string op_type() const { return op_info()->Type(); }
     const OpInfo* op_info() const;
     OpInfo* mutable_op_info();
@@ -64,9 +65,6 @@ class Node {
       return valid_kernels_;
     }
 
-    void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; }
-    void SetSubgraphID(int id) { subgraph_id_ = id; }
-    int subgraph_id() const { return subgraph_id_; }
     void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
     const std::shared_ptr<OpLite> op() const { return op_; }
@@ -82,11 +80,6 @@ class Node {
 
     // Description.
     std::string desc;
-
-   protected:
-    // -1 means not in subgraph, 0 means supported but not one id, id started
-    // from 1
-    int subgraph_id_{-1};
   };
 
   struct Arg {
diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h
index 4de0fdbf357160348a403d3c8527fe62891237f0..4e8c8be292bbd5e7f46664378634d4f1aeed2965 100644
--- a/lite/core/mir/pass.h
+++ b/lite/core/mir/pass.h
@@ -52,34 +52,44 @@ class Pass {
 
   // Bind targets. At runtime, there must be one device in the bound targets.
   void BindTargets(const std::set<TargetType>& targets) {
-    std::set<TargetType> res;
     for (const auto& target : targets) {
       const std::set<TargetType>& universe = ExpandValidTargets(target);
       std::set_union(bound_targets_.begin(),
                      bound_targets_.end(),
                      universe.begin(),
                      universe.end(),
-                     std::inserter(res, res.begin()));
+                     std::inserter(bound_targets_, bound_targets_.begin()));
     }
-    bound_targets_ = res;
   }
 
   // Exclude targets. At runtime, there must be one device in the bound targets.
+  // Disable the pass if one of the valid devices is in the excluded targets.
   void ExcludeTargets(const std::set<TargetType>& targets) {
-    std::set<TargetType> res;
     for (const auto& target : targets) {
       const std::set<TargetType>& universe = ExpandValidTargets(target);
-      std::set_difference(bound_targets_.begin(),
-                          bound_targets_.end(),
-                          universe.begin(),
-                          universe.end(),
-                          std::inserter(res, res.begin()));
+      std::set<TargetType> updated_bound_targets;
+      std::set_difference(
+          bound_targets_.begin(),
+          bound_targets_.end(),
+          universe.begin(),
+          universe.end(),
+          std::inserter(updated_bound_targets, updated_bound_targets.begin()));
+      bound_targets_ = updated_bound_targets;
+      std::set_union(
+          excluded_targets_.begin(),
+          excluded_targets_.end(),
+          universe.begin(),
+          universe.end(),
+          std::inserter(excluded_targets_, excluded_targets_.begin()));
     }
-    bound_targets_ = res;
   }
 
   // Get all bound targets.
-  const std::set<TargetType>& Targets() const { return bound_targets_; }
+  const std::set<TargetType>& BoundTargets() const { return bound_targets_; }
+  // Get all excluded targets.
+  const std::set<TargetType>& ExcludedTargets() const {
+    return excluded_targets_;
+  }
 
   // Some passes are only available on qualified kernels and need to be
   // explicitly declared.
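A standalone sketch of the set bookkeeping above (illustrative, not part of the patch; the Target enum and the one-element universe stand in for TargetType and ExpandValidTargets): ExcludeTargets subtracts each excluded target's expanded universe from bound_targets_ and accumulates it into excluded_targets_, which PassMatchesTarget later checks for overlap.

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>

enum class Target { kARM, kOpenCL, kNPU };

int main() {
  std::set<Target> bound{Target::kARM, Target::kOpenCL, Target::kNPU};
  std::set<Target> excluded;
  // ExcludeTargets({kOpenCL}): drop its universe from the bound set...
  const std::set<Target> universe{Target::kOpenCL};
  std::set<Target> updated;
  std::set_difference(bound.begin(), bound.end(),
                      universe.begin(), universe.end(),
                      std::inserter(updated, updated.begin()));
  bound = updated;
  // ...and accumulate it into the excluded set.
  std::set_union(excluded.begin(), excluded.end(),
                 universe.begin(), universe.end(),
                 std::inserter(excluded, excluded.begin()));
  std::cout << bound.size() << " bound, " << excluded.size() << " excluded\n";
  // Prints "2 bound, 1 excluded": the pass is now disabled for OpenCL runs.
  return 0;
}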
@@ -116,6 +126,7 @@ class Pass {
   std::string name_;
   std::string doc_;
   std::set<TargetType> bound_targets_;
+  std::set<TargetType> excluded_targets_;
   std::unordered_map<std::string, std::set<lite_api::Place>> bound_kernels_;
 };
 
diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc
index 4f6be2c186d2d940a799201812cce397a9e94eb4..5bddfcbd3c17288546dc6e0a0b4ebf984d26c504 100644
--- a/lite/core/mir/pass_utils.cc
+++ b/lite/core/mir/pass_utils.cc
@@ -47,10 +47,34 @@ bool KernelRegistered(const std::string name, const Place& place) {
   return false;
 }
 
-bool PassMatchesTarget(const mir::Pass& pass, TargetType target) {
-  const auto& targets = pass.Targets();
-  if (targets.find(TARGET(kAny)) != targets.end()) return true;
-  return (targets.find(target) != targets.end());
+bool PassMatchesTarget(const mir::Pass& pass,
+                       const std::set<TargetType>& targets) {
+  // Is the pass suitable for the given targets? The condition is that the
+  // intersection of targets and the pass's bound targets is not empty, and
+  // the intersection of targets and the pass's excluded targets is empty.
+  // As a formula:
+  //   matched = !empty(targets ^ pass.bound_targets) &&
+  //             empty(targets ^ pass.excluded_targets)
+  // where ^ is the set-intersection operation.
+  const auto& bound_targets = pass.BoundTargets();
+  bool matched = bound_targets.find(TARGET(kAny)) != bound_targets.end();
+  std::set<TargetType> inter_bound_targets;
+  std::set_intersection(
+      bound_targets.begin(),
+      bound_targets.end(),
+      targets.begin(),
+      targets.end(),
+      std::inserter(inter_bound_targets, inter_bound_targets.begin()));
+  matched |= !inter_bound_targets.empty();
+  const auto& excluded_targets = pass.ExcludedTargets();
+  matched &= excluded_targets.find(TARGET(kAny)) == excluded_targets.end();
+  std::set<TargetType> inter_excluded_targets;
+  std::set_intersection(
+      excluded_targets.begin(),
+      excluded_targets.end(),
+      targets.begin(),
+      targets.end(),
+      std::inserter(inter_excluded_targets, inter_excluded_targets.begin()));
+  matched &= inter_excluded_targets.empty();
+  return matched;
 }
 
 bool PassMatchesKernels(const mir::Pass& pass) {
diff --git a/lite/core/mir/pass_utils.h b/lite/core/mir/pass_utils.h
index 942f64bf3190be1f399ac6f014be0881b1450d9b..57e8da5e461f40bd79ece8139c3290e17e762996 100644
--- a/lite/core/mir/pass_utils.h
+++ b/lite/core/mir/pass_utils.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <set>
 #include <string>
 #include "lite/core/mir/pass.h"
 
@@ -24,7 +25,8 @@ namespace lite {
 bool KernelRegistered(const std::string name, const Place& place);
 
 // Check if the pass hits the hardware target.
-bool PassMatchesTarget(const mir::Pass& pass, TargetType target);
+bool PassMatchesTarget(const mir::Pass& pass,
+                       const std::set<TargetType>& targets);
 
 // Check if the pass hits all necessary operators.
bool PassMatchesKernels(const mir::Pass& pass); diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index 8e0fc55be2389244ae065b4c2809bbdd74be370c..b625919cbfb6d26ecbbd1bad36772aff86bee087 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -377,6 +377,19 @@ PMNode *PMNode::assert_is_op(const std::string &op_type) { return this; } +PMNode *PMNode::assert_is_not_op_type(const std::string &op_type) { + asserts_.emplace_back([op_type](const Node *x) { + if (x && x->IsStmt()) { + auto *op_info = x->stmt()->op_info(); + if (op_info->Type() == op_type) { + return false; + } + } + return true; + }); + return this; +} + PMNode *PMNode::assert_is_var() { asserts_.emplace_back([](const Node *x) { return x && x->IsArg(); }); return this; diff --git a/lite/core/mir/pattern_matcher.h b/lite/core/mir/pattern_matcher.h index 47a0a30b5667ddc97b3783ab9edbab04281528a4..90c4359c6d3ade98cf60b5c23411e2026cdeccc9 100644 --- a/lite/core/mir/pattern_matcher.h +++ b/lite/core/mir/pattern_matcher.h @@ -123,6 +123,7 @@ struct PMNode { // Assertions, helper functions to simplify the pattern definition. PMNode* assert_is_op(); PMNode* assert_is_op(const std::string& op_type); + PMNode* assert_is_not_op_type(const std::string& op_type); PMNode* assert_is_var(); PMNode* assert_var_not_persistable(); PMNode* assert_is_persistable_var(); diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 8f22022789046900c3c09cfb122c914968d8d87f..2b5b65ce5903ede41137311c585c0e87eaaa0e9d 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -123,6 +123,9 @@ void SSAGraph::Build(const Program &program, return true; }; + std::unordered_map var_types = + program.var_data_type(); + std::unordered_map arg_update_node_map_; for (auto &op : program.ops()) { VLOG(3) << op->op_info()->Type(); @@ -137,6 +140,10 @@ void SSAGraph::Build(const Program &program, arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; } + if (var_types.count(name) && !arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } if (is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); @@ -146,6 +153,10 @@ void SSAGraph::Build(const Program &program, auto *arg_node = &node_storage_.back(); arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; + if (var_types.count(name) && !arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } if (is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 90aca56aec426f6b7ca0d300ded979ae7b10f6df..1cc8942d611db389a44cbf6a244775a5b666b587 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -14,7 +14,10 @@ #include "lite/core/mir/static_kernel_pick_pass.h" #include +#include #include +#include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -43,13 +46,33 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (!node.IsStmt()) continue; auto& instruct = node.AsStmt(); + std::unordered_map in_types; + std::unordered_map out_types; + for (std::list::iterator i = node.inlinks.begin(); + i != node.inlinks.end(); + ++i) { + if ((*i)->arg()->type) + 
in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node.outlinks.begin(); + i != node.outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } // Get candidate kernels std::vector>> scored; CHECK(!instruct.kernels().empty()) << "No kernels found for " << instruct.op_type(); VLOG(4) << "instruct.kernels().size():" << instruct.kernels().size(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(*kernel, graph->valid_places()); + float score = KernelGrade(instruct, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); VLOG(4) << "kernel->summary():" << kernel->summary() << " score:" << score; scored.emplace_back(score, std::move(kernel)); @@ -99,7 +122,13 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { instruct.ResetOp(update_desc, graph->valid_places()); scored.clear(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(*kernel, graph->valid_places()); + float score = KernelGrade(instruct, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); scored.emplace_back(score, std::move(kernel)); } std::sort(scored.begin(), scored.end(), KernelScoreCmp); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 90be0ea54e8761e2e68b12a396dde0df1bba3f26..f655b298bf2d800f4adf142ad14b8ac05ca00482 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/types.h" @@ -48,8 +50,14 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::KernelBase& kernel, - const std::vector& places) { + size_t KernelGrade( + const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, + const std::vector& places, + const std::unordered_map& in_types, + const std::unordered_map& out_types, + const std::vector& in_names, + const std::vector& out_names) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; @@ -66,7 +74,7 @@ class StaticKernelPickPass : public mir::StmtPass { // valid_places.size() as default. // where i is the place's index in valid_places array. 
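      // e.g. with valid_places = {kARM, kOpenCL}, place_size is 2, so a
      // kARM candidate is weighted by 1.0 and a kOpenCL candidate by 0.5.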
      // score: score is the weighted sum of target, precision, and layout
-      for (int i = 0; i < place_size; ++i) {
+      for (size_t i = 0; i < place_size; ++i) {
         const auto& place = places[i];
         float weight = static_cast<float>(place_size - i) / place_size;
         size_t score{};
@@ -83,8 +91,12 @@ class StaticKernelPickPass : public mir::StmtPass {
           (place.precision == kernel.precision() ||
            kernel.precision() == PRECISION(kAny) ||
            place.precision == PRECISION(kAny))) {
-        score += kMax / static_cast<size_t>(
-                     core::KernelPickFactor::Factor::PrecisionFirst);
+        // Skip the precision score if the kernel is int8 but the op is not
+        // an int8 (quantized) op.
+        if (!(kernel.precision() == PRECISION(kInt8) &&
+              !instruct.op_info()->HasAttr("enable_int8"))) {
+          score += kMax / static_cast<size_t>(
+                       core::KernelPickFactor::Factor::PrecisionFirst);
+        }
       }
       VLOG(4) << "[score s2]:" << score;
       if (kernel_pick_factors_.IsDataLayoutConsidered() &&
@@ -95,6 +107,37 @@
             core::KernelPickFactor::Factor::DataLayoutFirst);
       }
       VLOG(4) << "[score s3]:" << score;
+
+      // Add a new rule for precision: when the input types are consistent
+      // with the kernel's declared input types and the output types are
+      // consistent with the kernel's declared output types, prefer the kernel
+      // of that precision. Note that this strategy is not compatible with
+      // quantization, so quantized (int8) ops are skipped.
+      if (!instruct.op_info()->HasAttr("enable_int8")) {
+        bool type_match = true;
+        for (size_t i = 0; i < in_names.size(); ++i) {
+          std::string tmp;
+          CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
+          if (in_types.count(in_names[i]) &&
+              in_types.at(in_names[i]) !=
+                  kernel.GetInputDeclType(tmp)->precision()) {
+            type_match = false;
+          }
+        }
+        for (size_t i = 0; i < out_names.size(); ++i) {
+          std::string tmp;
+          CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+          if (out_types.count(out_names[i]) &&
+              out_types.at(out_names[i]) !=
+                  kernel.GetOutputDeclType(tmp)->precision()) {
+            type_match = false;
+          }
+        }
+        if (type_match) {
+          score *= 2;
+        }
+        VLOG(4) << "[score s4]:" << score;
+      }
+
       if (weight * score > final_score) {
         final_score = weight * score;
         winner_place = place;
diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt
index 95b5fe5ae13e03940bda8d83fcfc252b4ca490ab..f8aa09676c2d1e6d4df6fafbaf6a54bc69491acc 100644
--- a/lite/core/mir/subgraph/CMakeLists.txt
+++ b/lite/core/mir/subgraph/CMakeLists.txt
@@ -1,50 +1,30 @@
-
+lite_cc_library(subgraph_detector
+  SRCS subgraph_detector.cc
+  DEPS mir_pass types subgraph_op)
 lite_cc_library(subgraph_pass
-  SRCS subgraph_program_pass.cc
-  DEPS mir_pass types ${mir_fusers})
-lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc
-  DEPS subgraph_pass mir_passes gflags model_parser cxx_api
-  ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
-if (WITH_TESTING)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz)
-  add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
-  set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
-  set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-
-set(subgraph_passes subgraph_pass)
-
-if(LITE_WITH_NPU)
-  lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass)
-  list(APPEND subgraph_passes npu_pass)
-  lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc
-    DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags
-    ARGS
--model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) - if (WITH_TESTING) - add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + SRCS subgraph_pass.cc + DEPS mir_pass types context ${mir_fusers} subgraph_detector) +if (WITH_TESTING AND NOT LITE_WITH_CUDA) + lite_cc_test(test_subgraph_detector + SRCS subgraph_detector_test.cc + DEPS subgraph_detector mir_passes gflags model_parser cxx_api + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) + add_dependencies(test_subgraph_detector + extern_lite_download_mobilenet_v1_tar_gz + extern_lite_download_mobilenet_v2_relu_tar_gz) set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() -endif() - -if(LITE_WITH_XPU) - lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass) - list(APPEND subgraph_passes xpu_pass) - lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc - DEPS xpu_pass mir_passes paddle_api_full gflags - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) - if (WITH_TESTING) - add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + lite_cc_test(test_subgraph_pass + SRCS subgraph_pass_test.cc + DEPS mir_passes paddle_api_full paddle_api_light gflags + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 + --optimized_model_dir=${LITE_MODEL_DIR}/lite_model_opt SERIAL) + add_dependencies(test_subgraph_pass + extern_lite_download_mobilenet_v1_tar_gz + extern_lite_download_mobilenet_v2_relu_tar_gz) set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() + set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") endif() -set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") -message(STATUS "----> subgraph_passes: ${subgraph_passes}") +set(mir_subgraphs subgraph_pass CACHE INTERNAL "mir_subgraphs") +message(STATUS "----> mir_subgraphs: ${mir_subgraphs}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc deleted file mode 100644 index c83cd70d8225a0b33a50ebdad331283f377e0059..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "lite/backends/npu/builder.h" -#include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h" -#include "lite/kernels/npu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateNPUProgramPass::CvtVarNode( - lite::mir::Node* var_node, const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - VLOG(4) << "[NPU] Convert var node " << arg.name; - - auto* var = scope->FindVar(arg.name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - if (arg.is_weight) { - auto wgt = std::make_shared(arg.name); - LOG(INFO) << "[NPU] Convert const var node " << arg.name; - VLOG(4) << dims; - wgt->set_attr_value(lite::npu::CvtTensor(tensor)); - return wgt; - } else { - CHECK_EQ(dims.size(), 4); - LOG(INFO) << "[NPU] Convert data var node " << arg.name; - LOG(INFO) << dims; - // TODO(xxx): support more types and dims size - ge::TensorDesc desc(ge::Shape(dims.Vectorize()), - ge::Format::FORMAT_NCHW, - ge::DataType::DT_FLOAT); - - // auto size = desc.GetShape().GetShapeSize(); - // ge::TensorUtils::SetSize(desc, size*sizeof(float)); - // ge::TensorUtils::SetRealDimCnt(desc, 4); - auto data = std::make_shared(arg.name); - data->update_input_desc_x(desc); - return data; - } - return nullptr; -} - -void GenerateNPUProgramPass::CvtAllOpNodes( - const std::vector& nodes2cvt, - lite::kernels::npu::bridges::node_map_type* converted_vars) { - const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); - const auto& cvtfunc_map = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : nodes2cvt) { - lite::kernels::npu::bridges::node_map_type node_inputs; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!converted_vars->count(var_name)) { - converted_vars->insert( - std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope()))); - } - node_inputs.insert(*converted_vars->find(var_name)); - } - auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); - converted_vars->insert(node_outputs.begin(), node_outputs.end()); - } -} - -std::string GenerateNPUProgramPass::BuildNPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_nodes = GetTopologicalOrder(op_nodes); - lite::kernels::npu::bridges::node_map_type converted_vars; - CvtAllOpNodes(ordered_nodes, &converted_vars); - - std::vector in_var_names; - std::vector out_var_names; - std::vector inputs; - std::vector outputs; - for (auto i : in_data_vars) { - auto argname = i->AsArg().name; - in_var_names.push_back(argname); - inputs.push_back(*converted_vars.at(argname)); - } - for (auto i : out_data_vars) { - auto argname = i->AsArg().name; - out_var_names.push_back(argname); - outputs.push_back(*converted_vars.at(argname)); - } - - std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; - auto any_op = (*op_nodes.begin())->AsStmt().op(); - auto 
weight = any_op->scope()->Var(weight_var_name)->GetMutable(); - weight->set_persistable(true); - weight->set_precision(PRECISION(kInt8)); - // Compiling IR graph to NPU model and store mode data into weight tensor with - // persistable=true, Sothat the model parser can recognize it and save it to - // param files - if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("Build NPU graph failed."); - } - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; - return weight_var_name; -} - -void GenerateNPUProgramPass::GenNPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto weight_var_name = - BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - weight_var_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "[NPU] Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[NPU] Build NPU graph failed."; - throw std::runtime_error("[NPU] Build NPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } - } -} - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "[NPU] program insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_npu_program_pass, - paddle::lite::mir::subgraph::GenerateNPUProgramPass) - .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h deleted file mode 100644 index 823ca5f1f624a9e920a5f395a9d5098c5ea52929..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/npu/builder.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/kernels/npu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateNPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes(const std::vector& nodes2cvt, - lite::kernels::npu::bridges::node_map_type* cvted_vars); - - std::shared_ptr CvtVarNode(lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildNPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenNPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); - - private: - std::vector insts_; -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc deleted file mode 100644 index 95339d6175c98f22d542db24f02d6d714ccbe2a8..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(model_file, "", "model file path of combined protobuf model"); -DEFINE_string(params_file, "", "params file path of combined protobuf model"); -DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); -DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); -DEFINE_int32(output_tensor_num, 1, "number of output tensors"); - -namespace paddle { -namespace lite { - -std::vector> ParseShape(std::string txt) { - std::vector> shape; - while (!txt.empty()) { - size_t idx = txt.find_first_of(":"); - std::string dims = txt.substr(0, idx); - std::vector s; - while (!dims.empty()) { - size_t idx = dims.find_first_of(","); - int d = atoi(dims.substr(0, idx).c_str()); - VLOG(3) << d; - s.push_back(d); - if (idx == std::string::npos) { - break; - } else { - dims = dims.substr(idx + 1); - } - } - shape.push_back(s); - if (idx == std::string::npos) { - break; - } else { - txt = txt.substr(idx + 1); - } - } - return shape; -} - -int64_t ShapeProduction(std::vector shape) { - int64_t s = 1; - for (int64_t dim : shape) { - s *= dim; - } - return s; -} - -void FillInputTensor( - const std::shared_ptr& predictor, - const std::vector>& input_tensor_shape, - const float value) { - for (int i = 0; i < input_tensor_shape.size(); i++) { - auto input_tensor = predictor->GetInput(i); - input_tensor->Resize(input_tensor_shape[i]); - auto input_tensor_data = input_tensor->mutable_data(); - auto input_tensor_size = ShapeProduction(input_tensor->shape()); - for (int j = 0; j < input_tensor_size; j++) { - input_tensor_data[i] = value; - } - } -} - -void CompareOutputTensor( - const std::shared_ptr& tar_predictor, - const std::shared_ptr& ref_predictor, - const int output_tensor_num) { - for (int i = 0; i < output_tensor_num; i++) { - auto tar_output_tensor = tar_predictor->GetOutput(i); - auto ref_output_tensor = ref_predictor->GetOutput(i); - auto tar_output_tensor_data = tar_output_tensor->data(); - auto ref_output_tensor_data = ref_output_tensor->data(); - auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); - auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); - EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); - for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto abs_diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); - auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(3) << "val: " << tar_output_tensor_data[j] - << " ref: " << ref_output_tensor_data[j] - << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; - EXPECT_LT(rel_diff, 0.1); - } - } -} - -std::shared_ptr TestModel( - const std::string& model_dir, - const std::string& model_file, - const std::string& params_file, - const std::vector& valid_places, - const std::vector>& input_tensor_shape, - const std::string& optimized_model_dir) { - // generate optimized model - lite_api::CxxConfig cxx_config; - cxx_config.set_model_dir(model_dir); - cxx_config.set_model_file(model_file); - cxx_config.set_param_file(params_file); - cxx_config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(cxx_config); - FillInputTensor(predictor, input_tensor_shape, 1); - predictor->SaveOptimizedModel(optimized_model_dir, - 
lite_api::LiteModelType::kNaiveBuffer); - // load optimized model - lite_api::MobileConfig mobile_config; - mobile_config.set_model_dir(optimized_model_dir); - mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); - mobile_config.set_threads(1); - predictor = lite_api::CreatePaddlePredictor(mobile_config); - FillInputTensor(predictor, input_tensor_shape, 1); - // run optimized model - for (int i = 0; i < FLAGS_warmup; i++) { - predictor->Run(); - } - for (int i = 0; i < FLAGS_repeats; i++) { - auto start = GetCurrentUS(); - predictor->Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - return predictor; -} - -TEST(NPUSubgraph, compare) { - // parsing input tensor shape, supported formats: "1,3,224,224" - // "1,3,224,224:1,80" - std::vector> input_tensor_shape = - ParseShape(FLAGS_input_tensor_shape); - // generate and run optimized CPU model - LOG(INFO) << " ================ CPU ================== "; - auto cpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/CPU"); - // generate and run optimized NPU model - LOG(INFO) << " ================ NPU ================== "; - auto npu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, - lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/NPU"); - // verify results - CompareOutputTensor(npu_predictor, cpu_predictor, FLAGS_output_tensor_num); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc deleted file mode 100644 index 319e1e51feb917b803753807ddbb1f72c2cb7084..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "lite/backends/xpu/builder.h" -#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h" -#include "lite/kernels/xpu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateXPUProgramPass::CvtVarNode( - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::mir::Node* var_node, - const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - auto var_name = arg.name; - VLOG(4) << "[XPU] Convert var node " << var_name; - - auto* var = scope->FindVar(var_name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - auto cvted_var_node = - std::make_shared(graph_ctx->builder->CreateTensor( - var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32))); - if (arg.is_weight) { - auto cvted_var_tensor = lite::xpu::CvtTensor(tensor); - graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor)); - } - return cvted_var_node; -} - -void GenerateXPUProgramPass::CvtAllOpNodes( - const std::vector& op_nodes, - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) { - const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : op_nodes) { - lite::kernels::xpu::bridges::node_map_type input_nodes; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!cvted_var_nodes->count(var_name)) { - cvted_var_nodes->insert(std::make_pair( - var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope()))); - } - input_nodes.insert(*cvted_var_nodes->find(var_name)); - } - auto output_nodes = - supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes); - cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end()); - } -} - -std::string GenerateXPUProgramPass::BuildXPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_op_nodes = GetTopologicalOrder(op_nodes); - lite::kernels::xpu::bridges::graph_ctx_type graph_ctx; - graph_ctx.builder = std::make_shared(); - graph_ctx.params = - std::make_shared(); - lite::kernels::xpu::bridges::node_map_type cvted_var_nodes; - CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes); - - std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; - auto any_op = (*op_nodes.begin())->AsStmt().op(); - auto weight = any_op->scope()->Var(weight_var_name)->GetMutable(); - weight->set_persistable(true); - weight->set_precision(PRECISION(kInt8)); - // Compiling graph to XPU model and store mode data into weight tensor with - // persistable=true, Sothat the model parser can recognize it and save it to - // param files - std::vector> ordered_cvted_var_nodes; - for (auto out_data_var : out_data_vars) { - auto var_name = out_data_var->AsArg().name; - ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]); 
- } - if (!lite::xpu::BuildModel(graph_ctx.builder, - graph_ctx.params, - &ordered_cvted_var_nodes, - weight)) { - LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; - return weight_var_name; -} - -void GenerateXPUProgramPass::GenXPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto weight_var_name = - BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - weight_var_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "[XPU] Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[XPU] Build XPU graph failed."; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } - } -} - -std::unique_ptr GenerateXPUProgramPass::GenProgram() { - LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_xpu_program_pass, - paddle::lite::mir::subgraph::GenerateXPUProgramPass) - .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h deleted file mode 100644 index cf121ae9503201e8cf6be40fe9054ccaf6e4b172..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/xpu/builder.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/kernels/xpu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateXPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes( - const std::vector& op_nodes, - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes); - - std::shared_ptr CvtVarNode( - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildXPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenXPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); - - private: - std::vector insts_; -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc deleted file mode 100644 index 728ecbc6b77666accd432b1ad82a03860588ab40..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(model_file, "", "model file path of combined protobuf model"); -DEFINE_string(params_file, "", "params file path of combined protobuf model"); -DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); -DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); -DEFINE_int32(output_tensor_num, 1, "number of output tensors"); - -namespace paddle { -namespace lite { - -std::vector> ParseShape(std::string txt) { - std::vector> shape; - while (!txt.empty()) { - size_t idx = txt.find_first_of(":"); - std::string dims = txt.substr(0, idx); - std::vector s; - while (!dims.empty()) { - size_t idx = dims.find_first_of(","); - int d = atoi(dims.substr(0, idx).c_str()); - VLOG(3) << d; - s.push_back(d); - if (idx == std::string::npos) { - break; - } else { - dims = dims.substr(idx + 1); - } - } - shape.push_back(s); - if (idx == std::string::npos) { - break; - } else { - txt = txt.substr(idx + 1); - } - } - return shape; -} - -int64_t ShapeProduction(std::vector shape) { - int64_t s = 1; - for (int64_t dim : shape) { - s *= dim; - } - return s; -} - -void FillInputTensor( - const std::shared_ptr& predictor, - const std::vector>& input_tensor_shape, - const float value) { - for (int i = 0; i < input_tensor_shape.size(); i++) { - auto input_tensor = predictor->GetInput(i); - input_tensor->Resize(input_tensor_shape[i]); - auto input_tensor_data = input_tensor->mutable_data(); - auto input_tensor_size = ShapeProduction(input_tensor->shape()); - for (int j = 0; j < input_tensor_size; j++) { - input_tensor_data[j] = value; - } - } -} - -void CompareOutputTensor( - const std::shared_ptr& tar_predictor, - const std::shared_ptr& ref_predictor, - const int output_tensor_num) { - for (int i = 0; i < output_tensor_num; i++) { - auto tar_output_tensor = tar_predictor->GetOutput(i); - auto ref_output_tensor = ref_predictor->GetOutput(i); - auto tar_output_tensor_data = tar_output_tensor->data(); - auto ref_output_tensor_data = ref_output_tensor->data(); - auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); - auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); - EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); - for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / - (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); - } - } -} - -std::shared_ptr TestModel( - const std::string& model_dir, - const std::string& model_file, - const std::string& params_file, - const std::vector& valid_places, - const std::vector>& input_tensor_shape, - const std::string& optimized_model_dir) { - // generate optimized model - lite_api::CxxConfig cxx_config; - cxx_config.set_model_dir(model_dir); - cxx_config.set_model_file(model_file); - cxx_config.set_param_file(params_file); - cxx_config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(cxx_config); - FillInputTensor(predictor, input_tensor_shape, -1); - predictor->SaveOptimizedModel(optimized_model_dir, - lite_api::LiteModelType::kNaiveBuffer); -#if 0 // TODO(hong19860320) supports light api for XPU - // load optimized model - lite_api::MobileConfig mobile_config; - 
mobile_config.set_model_dir(optimized_model_dir); - mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); - mobile_config.set_threads(1); - predictor = lite_api::CreatePaddlePredictor(mobile_config); - FillInputTensor(predictor, input_tensor_shape, 1); -#endif - // run optimized model - for (int i = 0; i < FLAGS_warmup; i++) { - predictor->Run(); - } - for (int i = 0; i < FLAGS_repeats; i++) { - auto start = GetCurrentUS(); - predictor->Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - return predictor; -} - -TEST(XPUSubgraph, compare) { - // parsing input tensor shape, supported formats: "1,3,224,224" - // "1,3,224,224:1,80" - std::vector> input_tensor_shape = - ParseShape(FLAGS_input_tensor_shape); - // generate and run optimized CPU model - LOG(INFO) << " ================ CPU ================== "; - auto cpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/CPU"); - // generate and run optimized XPU model - LOG(INFO) << " ================ XPU ================== "; - auto xpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, - lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/XPU"); - // verify results - CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d48b053a1a4140252d35e85d2351644d3c216e9 --- /dev/null +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -0,0 +1,551 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
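+//
+// SubgraphDetector clusters the operators accepted by a device-specific
+// "teller" into maximal subgraphs while keeping the whole graph acyclic;
+// SubgraphFuser then replaces each cluster with a single subgraph op.
+// Usage sketch (illustrative; assumes the teller is a
+// std::function<bool(Node*)> as used by the detector below):
+//   auto teller = [](Node* node) {
+//     return node->IsStmt() && node->AsStmt().op_type() == "conv2d";
+//   };
+//   std::vector<std::vector<Node*>> subgraphs =
+//       SubgraphDetector(graph.get(), teller)();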
+ +#include "lite/core/mir/subgraph/subgraph_detector.h" +#include +#include +#include +#include +#include +#include "lite/core/mir/dot.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +using inference::analysis::Dot; + +std::string SubgraphVisualizer::operator()() { + inference::analysis::Dot dot; + const std::vector subgraph_colors{ + "red", "green", "cyan", "bisque3", + "coral", "darkseagreen1", "goldenrod1", "darkorchid", + "antiquewhite", "aquamarine", "azure", "bisque4", + "blue2", "brown1", "burlywood1", "cadetblue1", + "chartreuse1", "chocolate1", "coral1", "cornsilk", + "crimson", "cyan4", "darkgoldenrod4", "darkolivegreen2", + "darkorange2", "darkorchid2", "darkseagreen3", "darkslategray", + "deeppink2", "deepskyblue2", "dodgerblue", "firebrick", + "floralwhite", "gold1", "skyblue3", "indianred", + "indigo", "lavenderblush2", "lightblue1", "lightsalmon3", + "khaki1", "ivory4", "sandybrown", "olivedrab2", + "turquoise4", "snow3", "sienna4", "salmon2", + }; + std::unordered_map subgraph_indices; + for (int i = 0; i < subgraphs_.size(); i++) { + for (int j = 0; j < subgraphs_[i].size(); j++) { + subgraph_indices[subgraphs_[i][j]] = i; + } + } + std::unordered_map exists_ops; + std::set exists_args; + for (auto &node : graph_->StmtTopologicalOrder()) { + if (!node->IsStmt()) { + continue; + } + auto op_type = node->AsStmt().op_type(); + if (!exists_ops.count(op_type)) { + exists_ops[op_type] = 0; + } else { + exists_ops[op_type]++; + } + auto op_name = op_type + std::to_string(exists_ops[op_type]); + std::string op_color = "white"; + if (subgraph_indices.count(node)) { + auto subgraph_idx = subgraph_indices[node]; + op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; + } + dot.AddNode(op_name, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", op_color)}); + for (auto &in_node : node->inlinks) { + auto arg_name = in_node->AsArg().name; + if (!exists_args.count(arg_name)) { + dot.AddNode(arg_name, {}); + exists_args.insert(arg_name); + } + dot.AddEdge(arg_name, op_name, {}); + } + for (auto &out_node : node->outlinks) { + auto arg_name = out_node->AsArg().name; + if (!exists_args.count(arg_name)) { + dot.AddNode(arg_name, {}); + exists_args.insert(arg_name); + } + dot.AddEdge(op_name, arg_name, {}); + } + } + + auto res = dot.Build(); + std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; + return res; +} + +// Find the ancestor node +SubgraphDetector::node_dat_t * +SubgraphDetector::node_dat_t::UnionFindAncestor() { + node_dat_t *ancestor = this; + while (ancestor->union_find_parent != ancestor) { + ancestor = ancestor->union_find_parent; + } + return ancestor; +} + +// Merge the two adjacent nodes into one node. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void SubgraphDetector::node_dat_t::UnionFindCombine(node_dat_t *candidate) { + // Make this two node share the same ancestor. 
+  union_find_parent = UnionFindAncestor();
+  node_dat_t *candidate_ancestor = candidate->UnionFindAncestor();
+  candidate_ancestor->union_find_parent = union_find_parent;
+  candidate->union_find_parent = union_find_parent;
+
+  // Obtain the input and output nodes for the combined one
+  std::unordered_set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
+  std::unordered_set<node_dat_t *> outputs(candidate->outlinks.begin(),
+                                           candidate->outlinks.end());
+  for (auto *out_node : outlinks) {
+    if (out_node != candidate) {
+      outputs.insert(out_node);
+    }
+  }
+  for (auto *in_node : candidate->inlinks) {
+    if (in_node != this) {
+      inputs.insert(in_node);
+    }
+  }
+
+// Update the dst and src nodes' inlinks and outlinks.
+#ifdef __clang__
+  inlinks = node_set_t(inputs.begin(), inputs.end());
+  outlinks = node_set_t(outputs.begin(), outputs.end());
+  candidate->inlinks.clear();
+  candidate->outlinks.clear();
+#else
+  inlinks = std::move(node_set_t(inputs.begin(), inputs.end()));
+  outlinks = std::move(node_set_t(outputs.begin(), outputs.end()));
+  candidate->inlinks.clear();
+  candidate->outlinks.clear();
+#endif
+
+  // Change all of the dst node's inputs' and outputs' corresponding inlinks
+  // and outlinks to point to the src node.
+  for (auto *in_node : inlinks) {
+    for (auto *&out_node : in_node->outlinks) {
+      if (out_node == candidate) {
+        out_node = this;
+      }
+    }
+  }
+  for (auto *out_node : outlinks) {
+    for (auto *&in_node : out_node->inlinks) {
+      if (in_node == candidate) {
+        in_node = this;
+      }
+    }
+  }
+}
+
+// FlexibleDFS
+// If reverse is true, do a reverse DFS.
+// If the enter func is not nullptr, it calls enter(node) before visiting any
+// children of node.
+// If the leave func is not nullptr, it calls leave(node) after visiting all
+// parents of node.
+void SubgraphDetector::FlexibleDFS(
+    const node_set_t &source,
+    bool reverse,
+    const std::function<bool(const node_dat_t *)> &enter,
+    const std::function<bool(const node_dat_t *)> &leave) {
+  std::vector<std::pair<node_dat_t *, bool>> stack;  // node, leave
+  for (auto &node : source) {
+    stack.push_back(std::pair<node_dat_t *, bool>(node, false));
+  }
+  std::unordered_set<const node_dat_t *> visited;
+  while (!stack.empty()) {
+    auto top = stack.back();
+    stack.pop_back();
+
+    if (top.second) {
+      if (leave && !leave(top.first)) return;
+    }
+    if (visited.count(top.first)) continue;
+    visited.insert(top.first);
+
+    if (enter && !enter(top.first)) return;
+
+    if (leave)
+      stack.push_back(std::pair<node_dat_t *, bool>(top.first, true));
+    const node_set_t iter_nodes =
+        reverse == true ? top.first->inlinks : top.first->outlinks;
+    for (auto *node : iter_nodes) {
+      if (!visited.count(node)) {
+        stack.push_back(std::pair<node_dat_t *, bool>(node, false));
+      }
+    }
+  }
+}
+
+void SubgraphDetector::InitNodes(node_map_t *nodes) {
+  // Initialize and mark the subgraph detector nodes based on teller.
+  for (auto &it : *nodes) {
+    for (auto &in_node : it.first->inlinks) {
+      it.second->inlinks.push_back((*nodes)[in_node]);
+    }
+    for (auto &out_node : it.first->outlinks) {
+      it.second->outlinks.push_back((*nodes)[out_node]);
+    }
+    if (teller_(it.first)) {
+      it.second->marked = true;
+      if (it.first->IsStmt()) {
+        // If a function is inside the subgraph, mark all of its output
+        // variables as inside too, so that two marked functions end up in
+        // the same subgraph. For example, in A_function->var->B_function,
+        // if A_function is marked, var should also be marked, so that
+        // B_function joins the same subgraph as A_function when B_function
+        // is marked.
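+        // (Only Stmt nodes are collected into the final clusters below; the
+        // marked Arg nodes merely keep the union-find structure connected.)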
+void SubgraphDetector::InitNodes(node_map_t *nodes) {
+  // Initialize and mark the subgraph detector nodes based on teller.
+  for (auto &it : *nodes) {
+    for (auto &in_node : it.first->inlinks) {
+      it.second->inlinks.push_back((*nodes)[in_node]);
+    }
+    for (auto &out_node : it.first->outlinks) {
+      it.second->outlinks.push_back((*nodes)[out_node]);
+    }
+    if (teller_(it.first)) {
+      it.second->marked = true;
+      if (it.first->IsStmt()) {
+        // If a function is inside the subgraph, mark all of its output
+        // variables to be inside too, so that two marked functions end up in
+        // the same subgraph. Let's take an example: A_function->var->B_function.
+        // If A_function is marked, var should also be marked, so that
+        // B_function will be in the same subgraph as A_function if B_function
+        // is marked.
+        for (auto &out_node : it.first->outlinks) {
+          (*nodes)[out_node]->marked = true;
+        }
+      }
+    }
+  }
+}
+
+std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
+    node_map_t *nodes) {
+  for (auto &it : *nodes) {
+    node_dat_t *node = it.second;
+    if (!node->marked) {
+      continue;
+    }
+    // Our algorithm must guarantee that:
+    // 1. The graph is always a directed acyclic graph (DAG).
+    // 2. If there is a path in the subgraph from X to Y (X and Y are both
+    //    nodes in the subgraph), then all paths from X to Y are in the
+    //    subgraph.
+    //
+    // In order to achieve the above guarantee, for adjacent nodes src -> dst:
+    // 1. Get all of dst's input nodes except src.
+    // 2. Reverse DFS from those input nodes.
+    // 3. If there is a path from the input nodes to src, then src and dst
+    //    can not be fused into one node; otherwise it can be done.
+    while (true) {
+      std::unordered_set<node_dat_t *> contract_nodes;
+      for (auto *out_node : node->outlinks) {
+        // must be a candidate
+        if (!out_node->marked) continue;
+        // get all dst input nodes except the src node.
+        node_set_t source_nodes;
+        for (auto *in_node : out_node->inlinks) {
+          if (in_node != node) {
+            source_nodes.push_back(in_node);
+          }
+        }
+
+        // Reverse DFS from the source_nodes.
+        bool have_excess_path = false;
+        FlexibleDFS(source_nodes,
+                    true,
+                    nullptr,
+                    [&have_excess_path, node](const node_dat_t *n) {
+                      if (n == node) {
+                        have_excess_path = true;
+                        return false;
+                      }
+                      return true;
+                    });
+        if (have_excess_path) continue;
+        contract_nodes.insert(out_node);
+      }
+      if (contract_nodes.empty()) break;
+
+      for (auto &contract_node : contract_nodes) {
+        node->UnionFindCombine(contract_node);
+      }
+    }
+  }
+
+  std::unordered_map<node_dat_t *, std::vector<Node *>> clusters;
+  for (auto &node : graph_->StmtTopologicalOrder()) {
+    if (!node->IsStmt()) continue;
+    if ((*nodes)[node]->marked) {
+      clusters[(*nodes)[node]->UnionFindAncestor()].push_back(node);
+    }
+  }
+  std::vector<std::vector<Node *>> subgraphs;
+  std::for_each(clusters.begin(),
+                clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  subgraphs.push_back(it.second);
+                });
+  return subgraphs;
+}
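Why the reverse DFS is necessary: contracting two marked nodes is only safe when no path between them escapes the subgraph. A worked counterexample, written as an illustrative comment sketch over a hypothetical three-op graph:

    // Consider a diamond where A and C are marked for fusion but B is not:
    //
    //     A ---> B ---> C
    //      \           ^
    //       +----------+
    //
    // Contracting A and C into one super node S would leave both edges
    // S -> B and B -> S, i.e. a cycle, so the graph would no longer be a
    // DAG and B could never be scheduled before or after S. The reverse
    // DFS from C's other input (here B) reaches A, sets have_excess_path,
    // and the contraction of this pair is skipped.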
+std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
+  node_map_t nodes;
+  for (auto &node : graph_->mutable_nodes()) {
+    nodes[&node] = new node_dat_t(&node);
+    CHECK(nodes[&node]);
+  }
+  // Initialize and mark the subgraph detector nodes based on teller.
+  InitNodes(&nodes);
+  // Run the Extract algorithm to find all subgraphs.
+  std::vector<std::vector<Node *>> subgraphs = ExtractSubgraphs(&nodes);
+  for (auto &it : nodes) {
+    CHECK(it.second);
+    delete it.second;
+  }
+  return subgraphs;
+}
+
+void SubgraphFuser::InsertNewNode(SSAGraph *graph,
+                                  int subgraph_idx,
+                                  const std::vector<Node *> &subgraph_nodes) {
+  // Create and attach a new subgraph op
+  cpp::OpDesc subgraph_op_desc;
+  subgraph_op_desc.SetType("subgraph");
+
+  // Create a new sub block desc for storing all of the Ops and Vars of the
+  // target subgraph; sub_block_idx is set as an attribute of the subgraph op,
+  // and sub_block_idx < 0 means it's a new subgraph op
+  int sub_block_idx = -(subgraph_idx + 1);
+  auto sub_block_desc = new cpp::BlockDesc();
+  sub_block_desc->ClearOps();
+  sub_block_desc->ClearVars();
+  for (auto &op_node : subgraph_nodes) {
+    auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
+    *sub_block_op_desc = *op_node->AsStmt().op_info();
+    sub_block_op_desc->SetAttr(
+        kKernelTypeAttr,
+        op_node->AsStmt().picked_kernel().SerializedKernelType());
+  }
+  subgraph_op_desc.SetAttr("sub_block", sub_block_idx);
+
+  // Extract input and output nodes from the target subgraph
+  std::unordered_set<Node *> input_var_nodes;
+  std::unordered_set<Node *> weight_var_nodes;
+  std::unordered_set<Node *> output_var_nodes;
+  std::unordered_set<Node *> local_var_nodes;
+  std::unordered_set<Node *> unused_var_nodes;
+  ExtractInputsOutputs(subgraph_nodes,
+                       &input_var_nodes,
+                       &weight_var_nodes,
+                       &output_var_nodes,
+                       &local_var_nodes,
+                       &unused_var_nodes);
+
+  // Set the input and output name mapping which stores the real inputs and
+  // outputs
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (auto &var_node : input_var_nodes) {
+    input_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : output_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
+                                                     input_var_names);
+  subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
+                                                     output_var_names);
+
+  // Set all of the inputs and outputs to the target subgraph op,
+  // to prevent vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
+  for (auto &var_node : weight_var_nodes) {
+    input_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : local_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : unused_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  subgraph_op_desc.SetInput("Inputs", input_var_names);
+  subgraph_op_desc.SetOutput("Outputs", output_var_names);
+  auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+  static_cast<operators::SubgraphOp *>(subgraph_op.get())
+      ->SetSubBlock(sub_block_desc);
+  auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
+  subgraph_op->Attach(subgraph_op_desc, any_op->scope());
+
+  // Create and add a new subgraph node into the graph
+  auto subgraph_op_node =
+      graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
+  for (auto &var_node : input_var_nodes) {
+    IR_NODE_LINK_TO(var_node, subgraph_op_node);
+  }
+  for (auto &var_node : weight_var_nodes) {
+    IR_NODE_LINK_TO(var_node, subgraph_op_node);
+  }
+  for (auto &var_node : output_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+  for (auto &var_node : local_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+  for (auto &var_node : unused_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+
+  // Create and assign the context to the picked kernel of the new subgraph
+  // node
+  auto &inst = subgraph_op_node->AsStmt();
+  inst.picked_kernel().SetContext(
+      ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
+
+  // Remove subgraph nodes and unused var nodes
+  auto nodes2rm = GetNodes2RM(subgraph_nodes,
+                              {input_var_nodes,
+                               weight_var_nodes,
+                               output_var_nodes,
+                               local_var_nodes,
+                               unused_var_nodes});
+  GraphSafeRemoveNodes(graph, nodes2rm);
+}
+
+void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
+                                              const SubgraphTeller &teller,
+                                              int min_subgraph_size) {
+  std::vector<std::vector<Node *>> subgraphs =
+      SubgraphDetector(graph, teller)();
+  SubgraphVisualizer(graph, subgraphs)();
+  for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) {
+    if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
+      InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
+    }
+  }
+}
+
+void SubgraphFuser::operator()() {
+  ReplaceNodesWithSubgraphs(graph_, teller_, min_subgraph_size_);
+}
+
+void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
+                          std::unordered_set<Node *> *input_var_nodes,
+                          std::unordered_set<Node *> *weight_var_nodes,
+                          std::unordered_set<Node *> *output_var_nodes,
+                          std::unordered_set<Node *> *local_var_nodes,
+                          std::unordered_set<Node *> *unused_var_nodes) {
+  for (auto &op_node : op_nodes) {
+    for (auto &var_node : op_node->inlinks) {
+      if (var_node->AsArg().is_weight) {
+        weight_var_nodes->insert(var_node);
+        continue;
+      }
+      if (!var_node->inlinks.empty()) {
+        // A var node can only come from one op node, so use front()
+        auto *prev_op_node = var_node->inlinks.front();
+        if (std::find(op_nodes.begin(), op_nodes.end(), prev_op_node) !=
+            op_nodes.end()) {
+          continue;
+        }
+      }
+      input_var_nodes->insert(var_node);
+    }
+    for (auto &var_node : op_node->outlinks) {
+      if (var_node->outlinks.empty()) {
+        // The next op is empty, so this var is actually unused
+        unused_var_nodes->insert(var_node);
+        continue;
+      }
+      // A var node can have more than one next op node, so continue if any
+      // of them is in op_nodes
+      bool next_op_in_nodes = false;
+      for (auto &next_op_node : var_node->outlinks) {
+        if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
+            op_nodes.end()) {
+          next_op_in_nodes = true;
+        }
+      }
+      if (next_op_in_nodes) {
+        local_var_nodes->insert(var_node);
+        continue;
+      }
+      output_var_nodes->insert(var_node);
+    }
+  }
+}
+
+std::unordered_set<Node *> GetNodes2RM(
+    const std::vector<Node *> &op_nodes,
+    const std::vector<std::unordered_set<Node *>> &excluded_var_nodes) {
+  std::unordered_set<Node *> nodes2rm(op_nodes.begin(), op_nodes.end());
+  for (auto &op_node : op_nodes) {
+    for (auto &var_node : op_node->inlinks) {
+      if (!nodes2rm.count(var_node)) {
+        nodes2rm.insert(var_node);
+      }
+    }
+    for (auto &var_node : op_node->outlinks) {
+      if (!nodes2rm.count(var_node)) {
+        nodes2rm.insert(var_node);
+      }
+    }
+  }
+  // Excluded nodes should not be removed
+  for (auto &excluded_var_node : excluded_var_nodes) {
+    for (auto &var_node : excluded_var_node) {
+      if (nodes2rm.count(var_node)) {
+        nodes2rm.erase(var_node);
+      }
+    }
+  }
+  return nodes2rm;
+}
+
+static void SortHelper(Node *node,
+                       const std::unordered_set<Node *> &unordered_nodes,
+                       std::unordered_set<const Node *> *visited_nodes,
+                       std::vector<Node *> *ordered_nodes) {
+  for (auto &var_node : node->inlinks) {
+    if (var_node->inlinks.empty()) continue;
+    auto *op_node = var_node->inlinks.front();
+    if (unordered_nodes.count(op_node) && !visited_nodes->count(op_node)) {
+      SortHelper(op_node, unordered_nodes, visited_nodes, ordered_nodes);
+    }
+  }
+  ordered_nodes->push_back(node);
+  visited_nodes->insert(node);
+}
+
+std::vector<Node *> GetTopologicalOrder(
+    const std::unordered_set<Node *> &unordered_nodes) {
+  std::unordered_set<const Node *> visited_nodes;
+  std::vector<Node *> ordered_nodes;
+  for (auto &node : unordered_nodes) {
+    if (!node->IsStmt()) continue;
+    if (visited_nodes.count(node)) continue;
+    SortHelper(node, unordered_nodes, &visited_nodes, &ordered_nodes);
+  }
+  return ordered_nodes;
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
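The DOT text that SubgraphVisualizer prints to stdout can also be persisted and rendered offline, for example with Graphviz (`dot -Tpng subgraphs.dot -o subgraphs.png`). A minimal usage sketch, assuming the graph and subgraphs come from SubgraphDetector as in the passes further below; the helper name is hypothetical:

    #include <fstream>

    void DumpSubgraphDot(paddle::lite::mir::SSAGraph *graph,
                         const std::vector<std::vector<paddle::lite::mir::Node *>> &subgraphs,
                         const std::string &path) {
      // One box per op, filled with its subgraph's color; white for ops
      // outside every subgraph.
      std::string dot_text =
          paddle::lite::mir::SubgraphVisualizer(graph, subgraphs)();
      std::ofstream ofs(path);
      ofs << dot_text;
    }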
diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6873655e976a785383269972221f001196431f8
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_detector.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+using SubgraphTeller = std::function<bool(Node*)>;
+
+class SubgraphVisualizer {
+ public:
+  SubgraphVisualizer(SSAGraph* graph,
+                     const std::vector<std::vector<Node*>>& subgraphs)
+      : graph_(graph), subgraphs_(subgraphs) {}
+  std::string operator()();
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  std::vector<std::vector<Node*>> subgraphs_;
+};
+
+/*
+ * Divide the graph into subgraphs according to the specified conditions.
+ * Return the divided clusters; a cluster consists of the op nodes in a
+ * subgraph.
+ */
+class SubgraphDetector {
+ public:
+  // This is a simple representation of a graph. Each node_dat_t holds a
+  // pointer to the underlying Node, so that graph analysis does not mutate
+  // the original graph.
+  struct node_dat_t;
+  using node_map_t = std::unordered_map<Node*, node_dat_t*>;
+  using node_set_t = std::vector<node_dat_t*>;
+  struct node_dat_t {
+    explicit node_dat_t(Node* _node) : node(_node) {}
+    Node* node;
+    bool marked{false};
+    node_dat_t* union_find_parent{this};
+    node_set_t inlinks{};
+    node_set_t outlinks{};
+    node_dat_t* UnionFindAncestor();
+    void UnionFindCombine(node_dat_t* candidate);
+  };
+  SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller)
+      : graph_(graph), teller_(teller) {}
+  std::vector<std::vector<Node*>> operator()();
+
+  void FlexibleDFS(const node_set_t& source,
+                   bool reverse,
+                   const std::function<bool(const node_dat_t*)>& enter,
+                   const std::function<bool(const node_dat_t*)>& leave);
+  void InitNodes(node_map_t* nodes);
+  std::vector<std::vector<Node*>> ExtractSubgraphs(node_map_t* nodes);
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  SubgraphTeller teller_;
+};
+
+/*
+ * Replace all subgraphs with subgraph ops. A block desc is added to each
+ * subgraph op to wrap the original op nodes, and all var nodes of the
+ * original op nodes are kept as the inputs and outputs of the subgraph op.
+ */
+class SubgraphFuser {
+ public:
+  SubgraphFuser(SSAGraph* graph,
+                const SubgraphTeller& teller,
+                int min_subgraph_size)
+      : graph_(graph), teller_(teller), min_subgraph_size_{min_subgraph_size} {}
+  void operator()();
+
+  // Remove the op nodes of the subgraphs and replace them with subgraph ops.
+  void ReplaceNodesWithSubgraphs(SSAGraph* graph,
+                                 const SubgraphTeller& teller,
+                                 int min_subgraph_size);
+  // Create a subgraph node with a block desc to wrap the original op nodes of
+  // the subgraph
+  void InsertNewNode(SSAGraph* graph,
+                     int subgraph_idx,
+                     const std::vector<Node*>& subgraph_nodes);
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  SubgraphTeller teller_;
+  int min_subgraph_size_;
+};
+
+void ExtractInputsOutputs(const std::vector<Node*>& op_nodes,
+                          std::unordered_set<Node*>* input_var_nodes,
+                          std::unordered_set<Node*>* weight_var_nodes,
+                          std::unordered_set<Node*>* output_var_nodes,
+                          std::unordered_set<Node*>* local_var_nodes,
+                          std::unordered_set<Node*>* unused_var_nodes);
+
+std::unordered_set<Node*> GetNodes2RM(
+    const std::vector<Node*>& op_nodes,
+    const std::vector<std::unordered_set<Node*>>& excluded_var_nodes);
+
+std::vector<Node*> GetTopologicalOrder(
+    const std::unordered_set<Node*>& unordered_nodes);
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc
similarity index 65%
rename from lite/core/mir/subgraph/subgraph_program_pass_test.cc
rename to lite/core/mir/subgraph/subgraph_detector_test.cc
index 22e20b81d831ff25df090a7565e671b9139122f7..3b0d7c5cd5c8a0d0901750148359f430b6d49894 100644
--- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_detector_test.cc
@@ -12,68 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/core/mir/subgraph/subgraph_program_pass.h"
+#include "lite/core/mir/subgraph/subgraph_detector.h"
 #include <gtest/gtest.h>
 #include <memory>
 #include <vector>
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
-#include "lite/core/mir/graph_visualize_pass.h"
 #include "lite/core/mir/ssa_graph.h"
 #include "lite/core/program.h"
 #include "lite/model_parser/cpp/program_desc.h"
 #include "lite/model_parser/model_parser.h"
 
 DEFINE_string(model_dir, "", "model_dir");
+DEFINE_string(model_file, "", "model file path of combined protobuf model");
+DEFINE_string(params_file, "", "params file path of combined protobuf model");
 
 namespace paddle {
 namespace lite {
 
-TEST(SubgraphTest, models) {
-  cpp::ProgramDesc program_desc;
-  auto scope = std::make_shared<Scope>();
-  // LoadModelPb(FLAGS_model_dir,
-  //             FLAGS_model_dir + "/model",
-  //             FLAGS_model_dir + "/params",
-  //             scope.get(),
-  //             &program_desc,
-  //             true);
-  LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc);
-  std::vector<Place> valid_places({
-      Place{TARGET(kHost), PRECISION(kFloat)},
-#ifdef LITE_WITH_ARM
-      Place{TARGET(kARM), PRECISION(kFloat)},
-#endif
-#ifdef LITE_WITH_NPU
-      Place{TARGET(kNPU), PRECISION(kFloat)},
-#endif
-#ifdef LITE_WITH_XPU
-      Place{TARGET(kXPU), PRECISION(kFloat)},
-#endif
-  });
-  lite::Program program(program_desc, scope, valid_places);
-  auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
-  graph->Build(program, valid_places);
-
-  std::vector<std::string> supported_op_types{"concat",
-                                              "conv2d",
-                                              "depthwise_conv2d",
-                                              "batch_norm",
-                                              "scale",
-                                              "pool2d",
-                                              "mul",
-                                              "elementwise_add",
-                                              "softmax",
-                                              "split",
-                                              "relu",
-                                              "reshape2",
-                                              "transpose2"};
-  auto* pass = new mir::subgraph::SubgraphProgramPass;
-  ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
-  LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
-}
-
-// return output_var_names
+// The helper functions for building a model manually
 std::vector<std::string> AddFCDesc(
     cpp::BlockDesc* block_desc,
     const std::shared_ptr<Scope>& scope,
@@ -84,24 +41,23 @@ std::vector<std::string> AddFCDesc(
   static int id = 0;
   std::string prefix = "fc_" + std::to_string(id);
   auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
-  auto* wgt = block_desc->AddVar<cpp::VarDesc>();
-  auto* bias = block_desc->AddVar<cpp::VarDesc>();
-  auto* out = block_desc->AddVar<cpp::VarDesc>();
+  auto* wgt = block_desc->AddVar<cpp::VarDesc>();
   wgt->SetName(prefix + "_W");
-  bias->SetName(prefix + "_Bias");
-  out->SetName(prefix + "_Out");
-  std::vector<std::string> out_var_names{prefix + "_Out"};
-
-  auto* wtensor = scope->Var(prefix + "_W")->GetMutable<Tensor>();
+  auto* wtensor = scope->Var(prefix + "_W")->GetMutable<lite::Tensor>();
   wtensor->Resize(wshape);
   wtensor->mutable_data<float>();
-  auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<Tensor>();
+  auto* bias = block_desc->AddVar<cpp::VarDesc>();
+  bias->SetName(prefix + "_Bias");
+  auto* btensor = scope->Var(prefix + "_Bias")->GetMutable<lite::Tensor>();
   btensor->Resize({wshape[1]});
   btensor->mutable_data<float>();
-  scope->Var(prefix + "_Out")->GetMutable<Tensor>();
+  auto* out = block_desc->AddVar<cpp::VarDesc>();
+  out->SetName(prefix + "_Out");
+  std::vector<std::string> out_var_names{prefix + "_Out"};
+  scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>();
 
   op_desc->SetType("fc");
   op_desc->SetInput("Input", input_var_names);
@@ -127,7 +83,7 @@ std::vector<std::string> AddElementwiseAddDesc(
   out->SetName(prefix + "_Out");
   std::vector<std::string> out_var_names{prefix + "_Out"};
 
-  scope->Var(prefix + "_Out")->GetMutable<Tensor>();
+  scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>();
 
   op_desc->SetType("elementwise_add");
   op_desc->SetInput("X", input_X_names);
@@ -151,7 +107,7 @@ std::vector<std::string> AddFeedDesc(
   out->SetName(prefix + "_Out");
   std::vector<std::string> out_var_names{prefix + "_Out"};
 
-  scope->Var(prefix + "_Out")->GetMutable<Tensor>();
+  scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>();
 
   op_desc->SetType("feed");
   op_desc->SetInput("X", input_X_names);
@@ -174,7 +130,7 @@ std::vector<std::string> AddFetchDesc(
   out->SetName(prefix + "_Out");
   std::vector<std::string> out_var_names{prefix + "_Out"};
 
-  scope->Var(prefix + "_Out")->GetMutable<Tensor>();
+  scope->Var(prefix + "_Out")->GetMutable<lite::Tensor>();
 
   op_desc->SetType("fetch");
   op_desc->SetInput("X", input_X_names);
@@ -184,41 +140,88 @@ std::vector<std::string> AddFetchDesc(
   return out_var_names;
 }
 
-std::unique_ptr<mir::SSAGraph> BuildSimpleNet(
-    cpp::ProgramDesc* program_desc,
-    const std::shared_ptr<Scope>& scope,
-    const std::vector<Place>& valid_places) {
-  program_desc->ClearBlocks();
-  auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>();
+TEST(Subgraph, detect_simple_model) {
+  cpp::ProgramDesc program_desc;
+  std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  // Build a simple network
+  program_desc.ClearBlocks();
+  auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
   block_desc->ClearOps();
   block_desc->ClearVars();
-
   auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
   var_desc->SetName("feed_var");
-  auto* feed_var = scope->Var("feed_var")->GetMutable<Tensor>();
+  auto* feed_var = scope->Var("feed_var")->GetMutable<lite::Tensor>();
   feed_var->Resize({1, 4});
   auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5});
   auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2});
-
-  lite::Program program(*program_desc, scope, valid_places);
+  Program program(program_desc, scope, valid_places);
   auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
   graph->Build(program, valid_places);
-
-  return graph;
+  // Apply subgraph detector and check results
+  auto teller = [](mir::Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    auto op_type = stmt.op_type();
+    const std::vector<std::string> supported_types = {"fc"};
+    return std::find(supported_types.begin(), supported_types.end(), op_type) !=
+           supported_types.end();
+  };
+  std::vector<std::vector<mir::Node*>> subgraphs =
+      mir::SubgraphDetector(graph.get(), teller)();
+  ASSERT_EQ(subgraphs.size(), 1);
+  ASSERT_EQ(graph->nodes().size(), 9);
+  mir::SubgraphVisualizer(graph.get(), subgraphs)();
 }
 
-TEST(SubGraphTest, SimpleNet) {
+TEST(Subgraph, detect_custom_model) {
+  if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
+      FLAGS_params_file.empty()) {
+    LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
+                 "the path of model files.";
+    return;
+  }
   cpp::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
   auto scope = std::make_shared<Scope>();
-  auto graph = BuildSimpleNet(&program_desc, scope, places);
-
-  std::vector<std::string> supported_op_types{"fc"};
-  auto* pass = new mir::subgraph::SubgraphProgramPass;
-  ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
-
-  ASSERT_EQ(graph->nodes().size(), 9);
-  // LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
+  LoadModelPb(FLAGS_model_dir,
+              FLAGS_model_file,
+              FLAGS_params_file,
+              scope.get(),
+              &program_desc,
+              !FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
+              false);
+  std::vector<Place> valid_places({
+#ifdef LITE_WITH_ARM
+      Place{TARGET(kARM), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_X86
+      Place{TARGET(kX86), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_NPU
+      Place{TARGET(kNPU), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_XPU
+      Place{TARGET(kXPU), PRECISION(kFloat)},
+#endif
+  });
+  Program program(program_desc, scope, valid_places);
+  auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
+  graph->Build(program, valid_places);
+  // Apply subgraph detector and check results
+  auto teller = [](mir::Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    auto op_type = stmt.op_type();
+    const std::vector<std::string> unsupported_types = {
+        "feed", "fetch", "subgraph"};
+    return std::find(unsupported_types.begin(),
+                     unsupported_types.end(),
+                     op_type) == unsupported_types.end();
+  };
+  std::vector<std::vector<mir::Node*>> subgraphs =
+      mir::SubgraphDetector(graph.get(), teller)();
+  ASSERT_EQ(subgraphs.size(), 1);
+  mir::SubgraphVisualizer(graph.get(), subgraphs)();
 }
 
 }  // namespace lite
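The two tests above show the two common teller styles: a whitelist for hand-built graphs and a blacklist for real models (everything except the framework ops). A hypothetical helper combining both styles, illustrative only and not part of the patch:

    #include <unordered_set>

    paddle::lite::mir::SubgraphTeller MakeTeller(
        const std::unordered_set<std::string>& whitelist,
        const std::unordered_set<std::string>& blacklist) {
      // Accept stmt nodes whose op type is whitelisted and not blacklisted.
      return [=](paddle::lite::mir::Node* node) {
        if (!node->IsStmt()) return false;
        const auto& op_type = node->AsStmt().op_type();
        return whitelist.count(op_type) != 0 && blacklist.count(op_type) == 0;
      };
    }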
diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e2cecd277820ab39b5a25db6159591157982d01
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/subgraph/subgraph_pass.h"
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/subgraph/subgraph_detector.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
+void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
+void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
+    .BindTargets({TARGET(kNPU)});
+REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
+    .BindTargets({TARGET(kXPU)});
+REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
+    .BindTargets({TARGET(kBM)});
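All three passes rely on the same X-macro trick: USE_SUBGRAPH_BRIDGE is re-defined to an insert statement just before including the target's bridge registry header. Assuming a hypothetical bridges header whose contents are USE_SUBGRAPH_BRIDGE(conv2d, kNPU) and USE_SUBGRAPH_BRIDGE(softmax, kNPU), the preprocessor output is equivalent to the sketch below, so registering a new bridge automatically widens the teller with no change to the pass:

    // What the #define/#include/#undef sequence expands to for the
    // assumed two-entry header:
    std::unordered_set<std::string> supported_lists;
    #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
    USE_SUBGRAPH_BRIDGE(conv2d, kNPU)   // -> supported_lists.insert("conv2d");
    USE_SUBGRAPH_BRIDGE(softmax, kNPU)  // -> supported_lists.insert("softmax");
    #undef USE_SUBGRAPH_BRIDGE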
diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ba0f2ab4aa52c384f4175de0eb34475b34fb94c
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class NPUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+class XPUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+class BMSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..247795a86ce2cbe962b161311f7845622ee3983e
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+DEFINE_string(model_file, "", "model file path of combined protobuf model");
+DEFINE_string(params_file, "", "params file path of combined protobuf model");
+DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model");
+DEFINE_string(input_tensor_shape, "1,3,224,224", "shape of input tensors");
+DEFINE_string(input_tensor_type, "float32", "data type of input tensors");
+DEFINE_string(output_tensor_type, "float32", "data type of output tensors");
+
+namespace paddle {
+namespace lite {
+
+// The helper functions for loading and running a model from the command line
+// and verifying the output data
+std::vector<std::string> TypeParsing(std::string text) {
+  std::vector<std::string> types;
+  while (!text.empty()) {
+    size_t index = text.find_first_of(":");
+    std::string type = text.substr(0, index);
+    VLOG(3) << type;
+    types.push_back(type);
+    if (index == std::string::npos) {
+      break;
+    } else {
+      text = text.substr(index + 1);
+    }
+  }
+  return types;
+}
+
+std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
+  std::vector<std::vector<int64_t>> shapes;
+  while (!text.empty()) {
+    size_t index = text.find_first_of(":");
+    std::string slice = text.substr(0, index);
+    std::vector<int64_t> shape;
+    while (!slice.empty()) {
+      size_t index = slice.find_first_of(",");
+      int d = atoi(slice.substr(0, index).c_str());
+      VLOG(3) << d;
+      shape.push_back(d);
+      if (index == std::string::npos) {
+        break;
+      } else {
+        slice = slice.substr(index + 1);
+      }
+    }
+    shapes.push_back(shape);
+    if (index == std::string::npos) {
+      break;
+    } else {
+      text = text.substr(index + 1);
+    }
+  }
+  return shapes;
+}
+
+int64_t ShapeProduction(std::vector<int64_t> shape) {
+  int64_t s = 1;
+  for (int64_t dim : shape) {
+    s *= dim;
+  }
+  return s;
+}
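A usage sketch of the two parsers above; the expected values follow directly from the code, and the wrapper function is hypothetical:

    // --input_tensor_shape="1,3,224,224:1,80" -> {{1,3,224,224}, {1,80}}
    // --input_tensor_type="float32:int64"     -> {"float32", "int64"}
    void ParsingExample() {
      auto shapes = ShapeParsing("1,3,224,224:1,80");
      CHECK_EQ(shapes.size(), 2u);
      CHECK_EQ(ShapeProduction(shapes[0]), 1 * 3 * 224 * 224);
      auto types = TypeParsing("float32:int64");
      CHECK_EQ(types.size(), 2u);
    }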
+
+void FillInputTensors(
+    const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
+    const std::vector<std::vector<int64_t>>& input_tensor_shape,
+    const std::vector<std::string>& input_tensor_type,
+    const float value) {
+#define FILL_TENSOR_WITH_TYPE(type)                            \
+  auto input_tensor_data = input_tensor->mutable_data<type>(); \
+  for (int j = 0; j < input_tensor_size; j++) {                \
+    input_tensor_data[j] = static_cast<type>(value);           \
+  }
+  for (int i = 0; i < input_tensor_shape.size(); i++) {
+    auto input_tensor = predictor->GetInput(i);
+    input_tensor->Resize(input_tensor_shape[i]);
+    auto input_tensor_size = ShapeProduction(input_tensor->shape());
+    if (input_tensor_type[i] == "float32") {
+      FILL_TENSOR_WITH_TYPE(float)
+    } else if (input_tensor_type[i] == "int64") {
+      FILL_TENSOR_WITH_TYPE(int64_t)
+    }
+  }
+#undef FILL_TENSOR_WITH_TYPE
+}
+
+void CheckOutputTensors(
+    const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
+    const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
+    const std::vector<std::string>& output_tensor_type) {
+#define CHECK_TENSOR_WITH_TYPE(type)                                          \
+  auto tar_output_tensor_data = tar_output_tensor->data<type>();              \
+  auto ref_output_tensor_data = ref_output_tensor->data<type>();              \
+  for (size_t j = 0; j < ref_output_tensor_size; j++) {                       \
+    auto abs_diff =                                                           \
+        std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]);     \
+    auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); \
+    VLOG(5) << "val: " << tar_output_tensor_data[j]                           \
+            << " ref: " << ref_output_tensor_data[j]                          \
+            << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff;        \
+    EXPECT_LT(rel_diff, 0.1);                                                 \
+  }
+  for (int i = 0; i < output_tensor_type.size(); i++) {
+    auto tar_output_tensor = tar_predictor->GetOutput(i);
+    auto ref_output_tensor = ref_predictor->GetOutput(i);
+    auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
+    auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape());
+    EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size);
+    if (output_tensor_type[i] == "float32") {
+      CHECK_TENSOR_WITH_TYPE(float)
+    } else if (output_tensor_type[i] == "int64") {
+      CHECK_TENSOR_WITH_TYPE(int64_t)
+    }
+  }
+#undef CHECK_TENSOR_WITH_TYPE
+}
+
+std::shared_ptr<lite_api::PaddlePredictor> TestModel(
+    const std::string& model_dir,
+    const std::string& model_file,
+    const std::string& params_file,
+    const std::vector<lite_api::Place>& valid_places,
+    const std::vector<std::vector<int64_t>>& input_tensor_shape,
+    const std::vector<std::string>& input_tensor_type,
+    const std::string& optimized_model_dir) {
+  // Generate the optimized model
+  lite_api::CxxConfig cxx_config;
+  cxx_config.set_model_dir(model_dir);
+  cxx_config.set_model_file(model_file);
+  cxx_config.set_param_file(params_file);
+  cxx_config.set_valid_places(valid_places);
+  auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
+  predictor->SaveOptimizedModel(optimized_model_dir,
+                                lite_api::LiteModelType::kNaiveBuffer);
+  // Load the optimized model
+  lite_api::MobileConfig mobile_config;
+  mobile_config.set_model_from_file(optimized_model_dir + ".nb");
+  mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
+  mobile_config.set_threads(1);
+  predictor = lite_api::CreatePaddlePredictor(mobile_config);
+  FillInputTensors(predictor, input_tensor_shape, input_tensor_type, 1);
+  // Run the optimized model
+  for (int i = 0; i < FLAGS_warmup; i++) {
+    predictor->Run();
+  }
+  for (int i = 0; i < FLAGS_repeats; i++) {
+    auto start = GetCurrentUS();
+    predictor->Run();
+    LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
+  }
+  return predictor;
+}
+
+TEST(Subgraph, generate_model_and_check_precision) {
+  if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
+      FLAGS_params_file.empty()) {
+    LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set "
+                 "the path of model files.";
+    return;
+  }
+  // Parsing the shape of input tensors from strings, supported formats:
+  // "1,3,224,224" and "1,3,224,224:1,80"
+  auto input_tensor_shape = ShapeParsing(FLAGS_input_tensor_shape);
+  // Parsing the data type of input and output tensors from strings, supported
+  // formats: "float32" and "float32:int64:int8"
+  auto input_tensor_type = TypeParsing(FLAGS_input_tensor_type);
+  auto output_tensor_type = TypeParsing(FLAGS_output_tensor_type);
+  std::vector<lite_api::Place> valid_places({
+#ifdef LITE_WITH_ARM
+      lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_X86
+      lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+#endif
+  });
+  // Generate and run the optimized model on CPU as the reference predictor
+  auto ref_predictor = TestModel(FLAGS_model_dir,
+                                 FLAGS_model_file,
+                                 FLAGS_params_file,
+                                 valid_places,
+                                 input_tensor_shape,
+                                 input_tensor_type,
+                                 FLAGS_optimized_model_dir + "_ref_opt_model");
+// Generate and run the optimized model on NPU/XPU as the target predictor
+#ifdef LITE_WITH_NPU
+  valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
+#endif
+#ifdef LITE_WITH_XPU
+  valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
+#endif
+  auto tar_predictor = TestModel(FLAGS_model_dir,
+                                 FLAGS_model_file,
+                                 FLAGS_params_file,
+                                 valid_places,
+                                 input_tensor_shape,
+                                 input_tensor_type,
+                                 FLAGS_optimized_model_dir + "_tar_opt_model");
+  // Check the difference of the output tensors between the reference
+  // predictor and the target predictor
+  CheckOutputTensors(tar_predictor, ref_predictor, output_tensor_type);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc
deleted file mode 100644
index 719a01dfd892f83da5e1d9b1efa6df758612acc7..0000000000000000000000000000000000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass.cc
+++ /dev/null
@@ -1,345 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/subgraph/subgraph_program_pass.h"
-#include <algorithm>
-#include <memory>
-#include <unordered_set>
-#include <vector>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/mir/pass_registry.h"
-#include "lite/core/mir/pattern_matcher.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace subgraph {
-
-std::unordered_map<int, std::unordered_set<Node*>>
-SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr<SSAGraph>& graph) {
-  std::unordered_map<int, std::unordered_set<Node*>> op_nodes;
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    int sub_id = stmt.subgraph_id();
-    if (sub_id < 1) continue;
-    if (!op_nodes.count(sub_id)) {
-      op_nodes[sub_id] = std::unordered_set<Node*>();
-    }
-    op_nodes.at(sub_id).insert(item);
-  }
-  return op_nodes;
-}
-
-cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc(
-    const std::string& weight_var_name,
-    const std::vector<std::string>& in_var_names,
-    const std::vector<std::string>& out_var_names) {
-  cpp::OpDesc op_desc;
-  op_desc.SetType("graph_op");
-  op_desc.SetInput("Inputs", in_var_names);
-  op_desc.SetInput("Weight", {weight_var_name});
-  op_desc.SetOutput("Outputs", out_var_names);
-  return op_desc;
-}
-
-void SubgraphProgramPass::InsertNewNode(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::string& weight_var_name,
-    Scope* scope,
-    const std::vector<Place>& valid_places,
-    std::unordered_set<Node*> in_data_vars,
-    std::unordered_set<Node*> in_wgt_vars,
-    std::unordered_set<Node*> out_data_vars,
-    std::unordered_set<Node*> out_unused_vars) {
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (auto i : in_data_vars) {
-    in_var_names.push_back(i->AsArg().name);
-  }
-  for (auto i : out_data_vars) {
-    out_var_names.push_back(i->AsArg().name);
-  }
-
-  auto op_desc = GenGraphOpDesc(weight_var_name, in_var_names, out_var_names);
-
-  auto graph_op = LiteOpRegistry::Global().Create("graph_op");
-  graph_op->Attach(op_desc, scope);
-  auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
-
-  for (auto& in_var : in_data_vars) {
-    IR_NODE_LINK_TO(in_var, new_op_node);
-  }
-  for (auto& in_var : in_wgt_vars) {
-    IR_NODE_LINK_TO(in_var, new_op_node);
-  }
-  for (auto& out_var : out_data_vars) {
-    IR_OP_VAR_LINK(new_op_node, out_var);
-  }
-  for (auto& out_var : out_unused_vars) {
-    IR_OP_VAR_LINK(new_op_node, out_var);
-  }
-
-  // add weight node to store pre-compilied NPU model
-  auto new_weight_node = graph->NewArgumentNode(weight_var_name);
-  new_weight_node->AsArg().is_weight = true;
-  new_weight_node->AsArg().is_persist = true;
-  DirectedLink(new_weight_node, new_op_node);
-
-  // assign context
-  auto& inst = new_op_node->AsStmt();
-  inst.picked_kernel().SetContext(
-      ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
-}
-
-void SubgraphProgramPass::SortHelper(
-    Node* node,
-    const std::unordered_set<Node*>& nodes_all,
-    std::unordered_set<const Node*>* visited_nodes,
-    std::vector<Node*>* ret) {
-  for (auto& var_node : node->inlinks) {
-    if (var_node->inlinks.empty()) continue;
-    auto* op_node = var_node->inlinks.front();
-    if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
-      SortHelper(op_node, nodes_all, visited_nodes, ret);
-    }
-  }
-  ret->push_back(node);
-  visited_nodes->insert(node);
-}
-
-std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
-    const std::unordered_set<Node*>& nodes) {
-  std::unordered_set<const Node*> visited;
-  std::vector<Node*> ret;
-  for (auto& node : nodes) {
-    if (!node->IsStmt()) continue;
-    if (visited.count(node)) continue;
-    SortHelper(node, nodes, &visited, &ret);
-  }
-  return ret;
-}
-
-void SubgraphProgramPass::FindInputOutputVars(
-    const std::unordered_set<Node*>& op_nodes,
-    std::unordered_set<Node*>* in_data_vars,
-    std::unordered_set<Node*>* in_wgt_vars,
-    std::unordered_set<Node*>* out_data_vars,
-    std::unordered_set<Node*>* out_unused_vars) {
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (in_var->AsArg().is_weight) {
-        in_wgt_vars->insert(in_var);
-        continue;
-      }
-      if (!in_var->inlinks.empty()) {
-        // var can only come from one op node, so use front
-        auto* pre_op_node = in_var->inlinks.front();
-        if (op_nodes.count(pre_op_node)) {
-          continue;
-        }
-      }
-      in_data_vars->insert(in_var);
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (out_var->outlinks.empty()) {
-        // the next op is empty so this var is actually unused
-        out_unused_vars->insert(out_var);
-        continue;
-      }
-      // var can have more than one next op node
-      // so, if any one in the op_nodes then continue
-      bool next_op_in_nodes = false;
-      for (auto& next_op_node : out_var->outlinks) {
-        if (op_nodes.count(next_op_node)) {
-          next_op_in_nodes = true;
-        }
-      }
-      if (next_op_in_nodes) {
-        continue;
-      }
-
-      out_data_vars->insert(out_var);
-    }
-  }
-}
-
-std::unordered_set<Node*> SubgraphProgramPass::GetNode2rm(
-    const std::unordered_set<Node*>& op_nodes,
-    const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
-  std::unordered_set<Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (!nodes2rm.count(in_var)) {
-        nodes2rm.insert(in_var);
-      }
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (!nodes2rm.count(out_var)) {
-        nodes2rm.insert(out_var);
-      }
-    }
-  }
-  // some nodes should not be removed
-  for (auto& e : excluded_nodes) {
-    for (auto& i : e) {
-      if (nodes2rm.count(i)) {
-        nodes2rm.erase(i);
-      }
-    }
-  }
-  return nodes2rm;
-}
-
-void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    auto& op = stmt.op();
-    auto scope = op->scope();
-    std::string op_type = op->op_info()->Type();
-    // check the dimension of input variables in the scope, must not be empty!
-    if (op_type == "feed") {
-      auto input_var_names = op->op_info()->output_names();
-      CHECK_GE(input_var_names.size(), 1);
-      for (auto input_var_name : input_var_names) {
-        auto input_var = scope->FindVar(input_var_name);
-        CHECK(input_var) << "No input variable '" << input_var_name
-                         << "' found in scope " << scope;
-        auto input = input_var->GetMutable<lite::Tensor>();
-        CHECK(!input->dims().empty()) << "The dimension of input variable '"
-                                      << input_var_name
-                                      << "' can not be empty.";
-      }
-      continue;
-    }
-    if (op_type == "fetch") {
-      continue;
-    }
-    op->CheckShape();
-    op->InferShape();
-
-#ifndef LITH_WITH_XPU
-    // TOOD(xxx): remove Launch() at last
-    auto& kkks = stmt.kernels();
-    if (!kkks.empty()) {
-      auto& kk = stmt.kernels().front();
-      if (kk) {
-        kk->Launch();
-      }
-    }
-#endif
-  }
-}
-
-void SubgraphProgramPass::InitSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    stmt.ClearSubgraphID();
-    if (std::find(supported_op_types.begin(),
-                  supported_op_types.end(),
-                  stmt.op_type()) != supported_op_types.end()) {
-      stmt.SetSubgraphID(0);
-      LOG(INFO) << "supported " << stmt.op_type();
-    } else {
-      LOG(INFO) << "======= not supported " << stmt.op_type();
-    }
-  }
-}
-
-// mark current and all output supported nodes
-void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node,
-                                                  int to_id,
-                                                  int from_id) {
-  if (!node) return;
-  if (node->IsStmt()) {
-    auto& stmt = node->AsStmt();
-    if (stmt.subgraph_id() == from_id) {
-      stmt.SetSubgraphID(to_id);
-      for (auto& i : node->outlinks) {
-        ChangeAllOutConnectedID(i, to_id, from_id);
-      }
-    } else {
-      LOG(INFO) << "failed op type:" << stmt.op_type();
-      return;
-    }
-  } else {
-    // this it arg node
-    bool all_out_op_supported = true;
-    for (auto& i : node->outlinks) {
-      if (!i->IsStmt()) return;
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() < from_id) {
-        all_out_op_supported = false;
-      }
-    }
-    if (!all_out_op_supported) {
-      return;
-    }
-    for (auto& i : node->outlinks) {
-      CHECK(i->IsStmt());
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() == from_id) {
-        stmt.SetSubgraphID(to_id);
-        for (auto& o : i->outlinks) {
-          ChangeAllOutConnectedID(o, to_id, from_id);
-        }
-      }
-    }
-  }
-}
-
-int SubgraphProgramPass::FuseSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph) {
-  int sub_id = 1;  // id start from 1 not 0
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    // bool inputvar = false;
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    /*
-    if (stmt.subgraph_id() == -1) {
-      for (auto& i : item->outlinks) {
-        for (auto& j : i->outlinks) {
-          if (j->IsStmt()) {
-            auto& jstmt = j->AsStmt();
-            if (jstmt.subgraph_id() == 0) inputvar = true;
-          }
-        }
-      }
-    }
-    */
-    if (stmt.subgraph_id() != 0) continue;
-    ChangeAllOutConnectedID(item, sub_id);
-    sub_id++;
-  }
-  return sub_id - 1;
-}
-
-int SubgraphProgramPass::FuseSubgraph(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  InitSubgraphID(graph, supported_op_types);
-  return FuseSubgraphID(graph);
-}
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(subgraph_program_pass,
-                  paddle::lite::mir::subgraph::SubgraphProgramPass)
-    .BindTargets({TARGET(kAny)});
diff --git a/lite/core/mir/subgraph/subgraph_program_pass.h b/lite/core/mir/subgraph/subgraph_program_pass.h
deleted file mode 100644
index 24c0233bbb428a71fa5645b23573494b5067d8b1..0000000000000000000000000000000000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace subgraph {
-
-class SubgraphProgramPass : public ProgramPass {
- public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  // make all the linked ops in subgraph with same subgraph_id
-  // return the fused subgraph numbers
-  int FuseSubgraph(const std::unique_ptr<SSAGraph>& graph,
-                   const std::vector<std::string>& supported_op_types);
-
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override{};
-
- protected:
-  void InferOnce(const std::unique_ptr<SSAGraph>& graph);
-
-  // clear all subgraph id and mark all ops, which could be fuse, as id zero
-  void InitSubgraphID(const std::unique_ptr<SSAGraph>& graph,
-                      const std::vector<std::string>& supported_op_types);
-
-  // make all the linked ops in subgraph with same subgraph_id
-  // return the fused subgraph numbers
-  int FuseSubgraphID(const std::unique_ptr<SSAGraph>& graph);
-
-  // // GenerateFusedGraph:
-  // std::unique_ptr<SSAGraph> GenerateFusedGraph(const
-  // std::unique_ptr<SSAGraph>& graph, int sub_num);
-  void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
-
-  // Below function cloud be useful in child classes //
-  // classify node by subgraph id
-  std::unordered_map<int, std::unordered_set<Node*>> ClassifySubgraph(
-      const std::unique_ptr<SSAGraph>& graph);
-
-  // generate the graph op desc
-  cpp::OpDesc GenGraphOpDesc(const std::string& weight_var_name,
-                             const std::vector<std::string>& in_var_names,
-                             const std::vector<std::string>& out_var_names);
-
-  // insert a new graph op node
-  void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
-                     const std::string& weight_var_name,
-                     Scope* scope,
-                     const std::vector<Place>& valid_places,
-                     std::unordered_set<Node*> in_data_vars,
-                     std::unordered_set<Node*> in_wgt_vars,
-                     std::unordered_set<Node*> out_data_vars,
-                     std::unordered_set<Node*> out_unused_vars);
-
-  // Sort and return the topology order of nodes set
-  std::vector<Node*> GetTopologicalOrder(
-      const std::unordered_set<Node*>& nodes);
-
-  // find all input data vars, input weight vars,
-  // output data vars and output vars from the nodes
-  void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
-                           std::unordered_set<Node*>* in_data_vars,
-                           std::unordered_set<Node*>* in_wgt_vars,
-                           std::unordered_set<Node*>* out_data_vars,
-                           std::unordered_set<Node*>* out_unused_vars);
-
-  // return the node to remove in the subgraph
-  std::unordered_set<Node*> GetNode2rm(
-      const std::unordered_set<Node*>& op_nodes,
-      const std::vector<std::unordered_set<Node*>>& excluded_nodes);
-
- private:
-  // sort nodes to operational sequence
-  void SortHelper(Node* node,
-                  const std::unordered_set<Node*>& nodes_all,
-                  std::unordered_set<const Node*>* visited_nodes,
-                  std::vector<Node*>* ret);
-};
-
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc
index b008faa687474a88988adb9da81c594306298b26..ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7 100644
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -16,6 +16,7 @@
 #include <list>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "lite/core/mir/graph_visualize_pass.h"
@@ -35,18 +36,23 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 
   CHECK(!valid_places_.empty());
 
+  // record the copied node.
+  std::unordered_map<std::string, Node*> copied_nodes;
+
   for (auto& node : nodes) {
     if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
     auto inlinks = node->inlinks;
     for (auto* in : inlinks) {
-      ComplementInputs(graph.get(), node, in);
+      ComplementInputs(graph.get(), node, in, &copied_nodes);
     }
   }
 }
 
-void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph,
-                                               Node* inst_node,
-                                               Node* in) {
+void TypeTargetTransformPass::ComplementInputs(
+    SSAGraph* graph,
+    Node* inst_node,
+    Node* in,
+    std::unordered_map<std::string, Node*>* copied_nodes) {
   // If this input is out of date.
   if (inst_node->inlinks.end() ==
       std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in))
@@ -67,8 +73,13 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph,
             << " for kernel " << inst.op()->DebugString() << " "
             << *in->AsArg().type << " -> " << *decl_arg_type;
     // Add an IoCopy instruction to make the input compatible with other dist.
-    AddIoCopyInst(
-        *in->AsArg().type, *decl_arg_type, in, graph, inst_node, valid_places_);
+    AddIoCopyInst(*in->AsArg().type,
+                  *decl_arg_type,
+                  in,
+                  graph,
+                  inst_node,
+                  copied_nodes,
+                  valid_places_);
   }
 }
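The new copied_nodes map memoizes the output node of each inserted io_copy op, keyed by the source variable's name, so a variable consumed by several kernels on a mismatched target gets one copy instead of one per consumer. The pattern, reduced to a self-contained sketch with illustrative names (not the pass's actual API):

    #include <functional>
    #include <string>
    #include <unordered_map>

    template <typename Node>
    Node* GetOrCreateCopy(const std::string& var_name,
                          std::unordered_map<std::string, Node*>* copied_nodes,
                          const std::function<Node*()>& create_io_copy) {
      auto it = copied_nodes->find(var_name);
      if (it != copied_nodes->end()) return it->second;  // reuse existing copy
      Node* copied = create_io_copy();  // build io_copy op plus its output var
      (*copied_nodes)[var_name] = copied;
      return copied;
    }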
@@ -78,128 +89,132 @@ void TypeTargetTransformPass::AddIoCopyInst(
     Node* in,
     SSAGraph* graph,
     Node* inst_node,
+    std::unordered_map<std::string, Node*>* copied_nodes,
     const std::vector<Place>& valid_places) {
   CHECK(!valid_places.empty()) << "valid_place should be set";
   // var -> new_transform_op -> new_var -> inst
   // So there will be a new Argument node and a new IoCopy Statement Node.
   CHECK(in->IsArg());
+  // auto node_id = [&] { return graph->nodes().size(); };
   auto io_copy_output_name =
       string_format("%s/target_trans", in->AsArg().name.c_str());
   //  string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
-  // TODO(MyPandaShaoxiang) should set same place with input?
-  auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
-  // Set the place for io_copy_output_arg node, the target should be equal to
-  // to.target()
-  // The precision and layout should be equal to from.precision(), from.layout()
-  io_copy_output_arg->AsArg().type =
-      LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
-  auto* io_copy_inst = graph->NewInstructNode();
-
-  bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
-  std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
-  io_copy_output_arg->AsArg().is_persist = in_persist;
-  // create Op and kernels.
-  auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type);
-  CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed";
-  // CHECK(io_copy_op);
-  // Create the new var manually.
-  inst_node->AsStmt().op()->scope()->Var(io_copy_output_name);
-
-  // Create IoCopy Instruction.
-  cpp::OpDesc op_desc;
-  op_desc.SetType(io_copy_type);
-  op_desc.SetInput("Input", {in->AsArg().name});
-  op_desc.SetOutput("Out", {io_copy_output_name});
-
-  io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
-  auto kernels = io_copy_op->CreateKernels(valid_places);
-  // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type
-  bool is_found = false;
-  std::vector<std::unique_ptr<KernelBase>> selected_kernels;
-  for (auto& kernel : kernels) {
-    const Type* in_arg_ty = kernel->GetInputDeclType("Input");
-    const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
-
-    VLOG(4) << "------ kernel info -------";
-    VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty;
-    VLOG(4) << "from(last kernel output):" << from;
-    VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
-    VLOG(4) << "to:" << to << "\n";
-
-    // kernel choose branch for opencl backend
-    // judge inst's target whether is kOpenCL
-    // Note: to == *decl_arg_type == in of inst, not output of last inst
-    // ignore [layout check] for layout between [to] and [from]
-    //   Because all of origin opencl insts in model, are not default layout
-    //   NCHW,
-    //   so skip layout check.
-    // detailed node info see below:
-    //     [*in->AsArg().type] -> [from]: out of inst's previous kernel
-    //     [*decl_arg_type] -> [to]: input of inst, not output of last
-    //     [in_arg_ty]: in of io_copy
-    //     [out_arg_ty]: out of io_copy
-    //
-    // noto: replace LITE_WITH_OPENCL macro with judge input and output target
-    // of io_copy
-    if ((in_arg_ty->target() == TARGET(kOpenCL) ||
-         out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
-        (TargetCompatibleTo(*in_arg_ty, from) &&
-         PrecisionCompatibleTo(*in_arg_ty, from) &&
-         DeviceCompatibleTo(*in_arg_ty, from) &&
-         TargetCompatibleTo(*out_arg_ty, to))) {
-      VLOG(4) << "picked, opencl found";
-      is_found = true;
-    } else if (TypeCompatible(*in_arg_ty, from) &&
-               out_arg_ty->target() == to.target()) {
-      VLOG(4) << "picked";
-      is_found = true;
-    }
-    if (is_found) {
-      selected_kernels.emplace_back(std::move(kernel));
-      // we pick the kernel
-      io_copy_inst->AsStmt(
-          io_copy_type, std::move(selected_kernels), io_copy_op);
-      break;
+  if (copied_nodes->count(in->AsArg().name)) {
+    // Remove the old link
+    RemoveDirectedLink(in, inst_node);
+
+    // Update the original instruction OpDesc.
+    // Update its input to the io_copy_output_name
+    // Add new link, newarg->inst
+    DirectedLink(copied_nodes->at(in->AsArg().name),
+                 inst_node);  // [io_copy kernel]'s output -> [current kernel]
+
+    UpdateInstNode(in, graph, inst_node, io_copy_output_name);
+  } else {
+    // TODO(MyPandaShaoxiang) should set same place with input?
+    auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
+    // Set the place for io_copy_output_arg node, the target should be equal to
+    // to.target()
+    // The precision and layout should be equal to from.precision(),
+    // from.layout()
+    io_copy_output_arg->AsArg().type =
+        LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
+    auto* io_copy_inst = graph->NewInstructNode();
+
+    bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
+    std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
+    io_copy_output_arg->AsArg().is_persist = in_persist;
+    // create Op and kernels.
+    auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type);
+    CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed";
+    // CHECK(io_copy_op);
+    // Create the new var manually.
+    inst_node->AsStmt().op()->scope()->Var(io_copy_output_name);
+
+    // Create IoCopy Instruction.
+    cpp::OpDesc op_desc;
+    op_desc.SetType(io_copy_type);
+    op_desc.SetInput("Input", {in->AsArg().name});
+    op_desc.SetOutput("Out", {io_copy_output_name});
+
+    io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
+    auto kernels = io_copy_op->CreateKernels(valid_places);
+    // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type
+    bool is_found = false;
+    std::vector<std::unique_ptr<KernelBase>> selected_kernels;
+    for (auto& kernel : kernels) {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+
+      VLOG(4) << "------ kernel info -------";
+      VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty;
+      VLOG(4) << "from(last kernel output):" << from;
+      VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
+      VLOG(4) << "to:" << to << "\n";
+
+      // kernel choose branch for opencl backend
+      // judge inst's target whether is kOpenCL
+      // Note: to == *decl_arg_type == in of inst, not output of last inst
+      // ignore [layout check] for layout between [to] and [from]
+      //   Because all of origin opencl insts in model, are not default layout
+      //   NCHW,
+      //   so skip layout check.
+      // detailed node info see below:
+      //     [*in->AsArg().type] -> [from]: out of inst's previous kernel
+      //     [*decl_arg_type] -> [to]: input of inst, not output of last
+      //     [in_arg_ty]: in of io_copy
+      //     [out_arg_ty]: out of io_copy
+      //
+      // noto: replace LITE_WITH_OPENCL macro with judge input and output target
+      // of io_copy
+      if ((in_arg_ty->target() == TARGET(kOpenCL) ||
+           out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
+          (TargetCompatibleTo(*in_arg_ty, from) &&
+           PrecisionCompatibleTo(*in_arg_ty, from) &&
+           DeviceCompatibleTo(*in_arg_ty, from) &&
+           TargetCompatibleTo(*out_arg_ty, to))) {
+        VLOG(4) << "picked, opencl found";
+        is_found = true;
+      } else if (TypeCompatible(*in_arg_ty, from) &&
+                 out_arg_ty->target() == to.target()) {
+        VLOG(4) << "picked";
+        is_found = true;
+      }
+
+      if (is_found) {
+        selected_kernels.emplace_back(std::move(kernel));
+        // we pick the kernel
+        io_copy_inst->AsStmt(
+            io_copy_type, std::move(selected_kernels), io_copy_op);
+        (*copied_nodes)[in->AsArg().name] = io_copy_output_arg;
+        break;
+      }
+
+      VLOG(4) << "not picked";
     }
 
-    VLOG(4) << "not picked";
-  }
+    CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
+                    << ":" << in->AsArg().name << " -> " << to << ":"
+                    << inst_node->AsStmt().op_info()->Type();
+    // Remove the old link
+    RemoveDirectedLink(in, inst_node);
 
-  CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
-                  << ":" << in->AsArg().name << " -> " << to << ":"
-                  << inst_node->AsStmt().op_info()->Type();
-  // Remove the old link
-  RemoveDirectedLink(in, inst_node);
-
-  // Update the original instruction OpDesc.
-  // Update its input to the io_copy_output_name
-  // Add new link, var -> new_inst, new_inst->newarg, newarg->inst
-  DirectedLink(in, io_copy_inst);  // [last kernel]'s output -> [io_copy kernel]
-  DirectedLink(
-      io_copy_inst,
-      io_copy_output_arg);  // [io_copy kernel] -> [io_copy kernel]'s output
-  DirectedLink(io_copy_output_arg,
-               inst_node);  // [io_copy kernel]'s output -> [current kernel]
+    // Update the original instruction OpDesc.
+ // Update its input to the io_copy_output_name + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, + io_copy_inst); // [last kernel]'s output -> [io_copy kernel] + DirectedLink( + io_copy_inst, + io_copy_output_arg); // [io_copy kernel] -> [io_copy kernel]'s output + DirectedLink(io_copy_output_arg, + inst_node); // [io_copy kernel]'s output -> [current kernel] - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); + UpdateInstNode(in, graph, inst_node, io_copy_output_name); + } std::string tmp; if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { @@ -220,6 +235,28 @@ void TypeTargetTransformPass::SetValidPlaces( valid_places_ = valid_places; } +void TypeTargetTransformPass::UpdateInstNode(Node* in, + SSAGraph* graph, + Node* inst_node, + std::string io_copy_output_name) { + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + in->AsArg().name, + io_copy_output_name); + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto update_op_info = *inst_node->AsStmt().op_info(); + // ResetOp() will change the Stmt op_info_ value, + // after that the old op_info_ value will be nullified. + // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. + // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). + // Whenever update the op_info of a stmt, we should call its ResetOp(). 
+ inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); +} + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h index 8a8cfaf9f9282cb477f7b9dd404d6f869333221b..e9a275882f7c2cb813c1c0b8add5cc4ca89b0c8b 100644 --- a/lite/core/mir/type_target_cast_pass.h +++ b/lite/core/mir/type_target_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -44,13 +45,17 @@ class TypeTargetTransformPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* copied_nodes); void AddIoCopyInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* copied_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); @@ -58,6 +63,11 @@ class TypeTargetTransformPass : public ProgramPass { const std::vector& valid_places() const { return valid_places_; } private: + void UpdateInstNode(Node* in, + SSAGraph* graph, + Node* inst_node, + std::string io_copy_output_name); + std::vector valid_places_; }; diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index 3f5d161a56aafa7fd9d058fd404e65cb04572116..875bf23082a24cb6fcae878b46cc9dcdbb2b76f7 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -48,6 +48,10 @@ class VariablePlaceInferencePass : public DebugPass { void CheckAllArgumentTypeDetermined(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (node.IsArg()) { + if (node.inlinks.size() == 0 && node.outlinks.size() == 0) { + // empty node + continue; + } CHECK(node.AsArg().type) << "node " << node.AsArg().name << " type not determined, " << &node; } @@ -129,6 +133,17 @@ class VariablePlaceInferencePass : public DebugPass { } else { x_in->AsArg().type = type; } + } else if (x_in->AsArg().type->target() == TARGET(kUnk) && + x_in->AsArg().type->precision() != PRECISION(kUnk) && + x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) { + // If is quantization, infer the Int8 type. + if (type->precision() == PRECISION(kInt8)) { + x_in->AsArg().type = type; + } else { + PrecisionType tmp_ptype = x_in->AsArg().type->precision(); + x_in->AsArg().type = LiteType::GetTensorTy( + type->target(), tmp_ptype, type->layout()); + } } } @@ -149,6 +164,17 @@ class VariablePlaceInferencePass : public DebugPass { } else { x_out->AsArg().type = type; } + } else if (x_out->AsArg().type->target() == TARGET(kUnk) && + x_out->AsArg().type->precision() != PRECISION(kUnk) && + x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) { + // If is quantization, infer the Int8 type. 
+        if (type->precision() == PRECISION(kInt8)) {
+          x_out->AsArg().type = type;
+        } else {
+          PrecisionType tmp_ptype = x_out->AsArg().type->precision();
+          x_out->AsArg().type = LiteType::GetTensorTy(
+              type->target(), tmp_ptype, type->layout());
+        }
       }
     }
   }
diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7889a54903f2a1d194fb3eade0bd92670b36699
--- /dev/null
+++ b/lite/core/mir/weight_quantization_preprocess_pass.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/weight_quantization_preprocess_pass.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void WeightQuantizationPreprocessPass::Apply(
+    const std::unique_ptr<SSAGraph>& graph) {
+  std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    if (node->IsStmt() &&
+        std::find(weight_quantized_op.begin(),
+                  weight_quantized_op.end(),
+                  node->AsStmt().op_type()) != weight_quantized_op.end()) {
+      auto* scope = node->stmt()->op()->scope();
+      auto* op_desc = node->stmt()->mutable_op_info();
+      if (op_desc->HasAttr("quantize_weight_bits")) {
+        for (auto& input_name : op_desc->input_vars()) {
+          std::string scale_name = input_name + "_quant_scale";
+          if (op_desc->HasAttr(scale_name)) {
+            VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
+            auto input_tensor =
+                scope->FindVar(input_name)->GetMutable<lite::Tensor>();
+            int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
+            auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
+            // the scale length equals the weight's output-channel count
+            std::vector<float> scale_list(weight_out_channel, input_scale[0]);
+            op_desc->SetAttr(scale_name, scale_list);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(weight_quantization_preprocess_pass,
+                  paddle::lite::mir::WeightQuantizationPreprocessPass)
+    .BindTargets({TARGET(kAny)});
diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..76a35c6b443c692ec08688abd4c10680be62b8af
--- /dev/null
+++ b/lite/core/mir/weight_quantization_preprocess_pass.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "lite/core/mir/pass.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+/*
+ * If the model is quantized by WeightQuantization in PostTrainingQuantization,
+ * the data type of the weight in the quantized ops (conv2d, depthwise_conv2d)
+ * is int, and the scale is saved in the quantized ops.
+ * WeightQuantizationPreprocessPass obtains the scale value, expands the
+ * scale value to a list, and saves the list in the quantized ops.
+ */
+class WeightQuantizationPreprocessPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
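For reference, the scale expansion performed by the pass above reduces to the following standalone sketch (the helper name is ours, not part of the patch):

```cpp
#include <string>
#include <vector>

// Sketch of the per-channel scale expansion done by
// WeightQuantizationPreprocessPass: the single scale recorded by
// post-training weight quantization is broadcast to one entry per
// output channel of the conv weight.
std::vector<float> ExpandWeightScale(const std::vector<float>& input_scale,
                                     int weight_out_channel) {
  // The expanded list's length equals the weight's output-channel count.
  return std::vector<float>(weight_out_channel, input_scale[0]);
}
```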
diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc
index 3b8b350ad82f2cc1ce296b1ad74a6e322abec8ff..b49670eefb8b2c6aae30cb041de4d055a2b9964c 100644
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
@@ -40,6 +40,18 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
       return Create<TARGET(target__),                                       \
                     PRECISION(precision__),                                 \
                     DATALAYOUT(kNCHW)>(op_type);                            \
+    case DATALAYOUT(kImageDefault):                                         \
+      return Create<TARGET(target__),                                       \
+                    PRECISION(precision__),                                 \
+                    DATALAYOUT(kImageDefault)>(op_type);                    \
+    case DATALAYOUT(kImageFolder):                                          \
+      return Create<TARGET(target__),                                       \
+                    PRECISION(precision__),                                 \
+                    DATALAYOUT(kImageFolder)>(op_type);                     \
+    case DATALAYOUT(kImageNW):                                              \
+      return Create<TARGET(target__),                                       \
+                    PRECISION(precision__),                                 \
+                    DATALAYOUT(kImageNW)>(op_type);                         \
     default:                                                                \
       LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
   }
@@ -54,6 +66,8 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
       CREATE_KERNEL1(target__, kFP16);   \
     case PRECISION(kAny):                \
       CREATE_KERNEL1(target__, kAny);    \
+    case PRECISION(kInt32):              \
+      CREATE_KERNEL1(target__, kInt32);  \
     case PRECISION(kInt64):              \
       CREATE_KERNEL1(target__, kInt64);  \
     default:                             \
@@ -86,6 +100,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
     case TARGET(kFPGA): {
       CREATE_KERNEL(kFPGA);
     } break;
+    case TARGET(kBM): {
+      CREATE_KERNEL(kBM);
+    } break;
     default:
       CHECK(false) << "not supported kernel target " << TargetToStr(target);
   }
@@ -115,6 +132,8 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kCUDA, kAny, kNCHW);
   INIT_FOR(kCUDA, kAny, kAny);
   INIT_FOR(kCUDA, kInt8, kNHWC);
+  INIT_FOR(kCUDA, kInt64, kNCHW);
+  INIT_FOR(kCUDA, kInt64, kNHWC);

   INIT_FOR(kHost, kFloat, kNCHW);
   INIT_FOR(kHost, kAny, kNCHW);
@@ -134,6 +153,7 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kARM, kInt8, kNCHW);
   INIT_FOR(kARM, kAny, kNCHW);
   INIT_FOR(kARM, kAny, kAny);
+  INIT_FOR(kARM, kInt32, kNCHW);

   INIT_FOR(kOpenCL, kFloat, kNCHW);
   INIT_FOR(kOpenCL, kFloat, kNHWC);
@@ -142,6 +162,17 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kOpenCL, kFloat, kAny);
   INIT_FOR(kOpenCL, kInt8, kNCHW);
   INIT_FOR(kOpenCL, kAny, kAny);
+  INIT_FOR(kOpenCL, kFP16, kNCHW);
+  INIT_FOR(kOpenCL, kFP16, kNHWC);
+  INIT_FOR(kOpenCL, kFP16, kImageDefault);
+  INIT_FOR(kOpenCL, kFP16, kImageFolder);
+  INIT_FOR(kOpenCL, kFP16, kImageNW);
+  INIT_FOR(kOpenCL, kFloat, kImageDefault);
+  INIT_FOR(kOpenCL, kFloat, kImageFolder);
+  INIT_FOR(kOpenCL, kFloat, kImageNW);
+  INIT_FOR(kOpenCL, kAny, kImageDefault);
+  INIT_FOR(kOpenCL, kAny, kImageFolder);
+  INIT_FOR(kOpenCL, kAny, kImageNW);

   INIT_FOR(kNPU, kFloat, kNCHW);
   INIT_FOR(kNPU, kInt8, kNCHW);
@@ -158,6 +189,11 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kFPGA, kFloat, kNHWC);
   INIT_FOR(kFPGA, kAny, kNHWC);
   INIT_FOR(kFPGA, kAny, kAny);
+
+  INIT_FOR(kBM, kFloat, kNCHW);
+  INIT_FOR(kBM, kInt8, kNCHW);
+  INIT_FOR(kBM, kAny, kNCHW);
+  INIT_FOR(kBM, kAny, kAny);
 #undef INIT_FOR
 }
diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h
index 1c67ee8f3dcafe30d9bda587d62233d0e715071e..a49682eea68240bfa178eb3d3351b8c7fb41048d 100644
--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -145,6 +145,15 @@ class KernelRegistry final {
                     KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
                     KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
                     KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
+                    KernelRegistryForTarget *,  //
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index a50ff3e6110a09851791190239358445141c8657..ddd94484ac4bb8d96d5c55300c985d21b44f1843 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 #include "lite/core/mir/generate_program_pass.h"
@@ -26,12 +27,6 @@
 #include "lite/core/program.h"
 #include "lite/core/types.h"
 #include "lite/model_parser/model_parser.h"
-#ifdef LITE_WITH_NPU
-#include "lite/core/mir/subgraph/generate_npu_program_pass.h"
-#endif
-#ifdef LITE_WITH_XPU
-#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
-#endif

 namespace paddle {
 namespace lite {
@@ -50,21 +45,6 @@ class Optimizer {
     valid_places_ = valid_places;
     CHECK(!valid_places.empty()) << "At least one valid_place should be set";
     CHECK(!graph_) << "duplicate optimize found";
-    auto valid_places_has_target = [&](TargetType t) -> bool {
-      for (auto& p : valid_places) {
-        if (p.target == t) {
-          return true;
-        }
-      }
-      return false;
-    };
-    std::map<std::string, bool> lite_with_targets{
-        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
-        {"kNPU", valid_places_has_target(TARGET(kNPU))},
-        {"kXPU", valid_places_has_target(TARGET(kXPU))}};
-    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
-    VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
-    VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];

     graph_.reset(new mir::SSAGraph);
     graph_->Build(program, valid_places);
@@ -75,19 +55,24 @@ class Optimizer {

     if (passes.empty()) {
       std::vector<std::string> passes_local{
-          {"lite_quant_dequant_fuse_pass",     //
-           "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
-           "lite_conv_bn_fuse_pass",           //
-           "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
+          {"lite_quant_dequant_fuse_pass",         //
+           "weight_quantization_preprocess_pass",  //
+           "lite_conv_elementwise_fuse_pass",      // conv-elemwise-bn
+           "lite_conv_bn_fuse_pass",               //
+           "lite_conv_elementwise_fuse_pass",      // conv-bn-elemwise
            // TODO(Superjomn) Refine the fusion related design to select fusion
            // kernels for devices automatically.
           "lite_conv_activation_fuse_pass",              //
+          "lite_var_conv_2d_activation_fuse_pass",       //
           "lite_fc_fuse_pass",                           //
           "lite_shuffle_channel_fuse_pass",              //
           "lite_transpose_softmax_transpose_fuse_pass",  //
           "lite_interpolate_fuse_pass",                  //
           "identity_scale_eliminate_pass",               //
-#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+          "elementwise_mul_constant_eliminate_pass",     //
+          "lite_sequence_pool_concat_fuse_pass",         //
+#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
+    (defined LITE_WITH_ARM)
           "lite_elementwise_add_activation_fuse_pass",  //
 #endif
           "static_kernel_pick_pass",  // pick original kernel from graph
@@ -122,13 +107,10 @@ class Optimizer {
           "argument_type_display_pass",

           "runtime_context_assign_pass",
-          "argument_type_display_pass"}};
-      if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
-          (!lite_with_targets["kXPU"])) {
-        // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
-        // kernel
-        passes_local.emplace_back("memory_optimize_pass");
-      }
+          "argument_type_display_pass",
+          "memory_optimize_pass",
+          "npu_subgraph_pass",
+          "xpu_subgraph_pass"}};
       RunPasses(passes_local);
     } else {
       RunPasses(passes);
@@ -140,40 +122,6 @@ class Optimizer {

   // Generate a new program based on the mir graph.
   std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
-#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
-    auto target_place = Place{
-#ifdef LITE_WITH_NPU
-        TARGET(kNPU),
-#endif
-#ifdef LITE_WITH_XPU
-        TARGET(kXPU),
-#endif
-        PRECISION(kFloat)};
-    if (std::find(valid_places_.begin(), valid_places_.end(), target_place) !=
-        valid_places_.end()) {
-#ifdef LITE_WITH_NPU
-      auto pass = mir::PassManager::Global()
-                      .LookUp<mir::subgraph::GenerateNPUProgramPass>(
-                          "generate_npu_program_pass");
-#endif
-
-#ifdef LITE_WITH_XPU
-      auto pass = mir::PassManager::Global()
-                      .LookUp<mir::subgraph::GenerateXPUProgramPass>(
-                          "generate_xpu_program_pass");
-#endif
-      try {
-        pass->Apply(graph_);
-        auto program = pass->GenProgram();
-        CHECK(exec_scope_);
-        program->set_exec_scope(exec_scope_);
-        return program;
-      } catch (...) {
-        LOG(WARNING) << "Build " << TargetToStr(target_place.target)
-                     << " program failed!";
-      }
-    }
-#endif
     auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
         "generate_program_pass");
     pass->Apply(graph_);
@@ -215,14 +163,16 @@ class Optimizer {
     for (auto& x : passes) {
       LOG(INFO) << "== Running pass: " << x;
       mir::Pass* pass = mir::PassManager::Global().LookUp(x);
-      CHECK(pass) << "Can not find pass: " << x;
-      bool matched = false;
+      if (!pass) {
+        LOG(INFO) << "   - Skip " << x << " because the pass isn't found.";
+        continue;
+      }
+      std::set<TargetType> targets;
       for (const auto& place : valid_places_) {
-        if (PassMatchesTarget(*pass, place.target)) {
-          matched = true;
-        }
+        targets.insert(place.target);
       }
-      matched = matched && PassMatchesKernels(*pass);
+      bool matched =
+          PassMatchesTarget(*pass, targets) && PassMatchesKernels(*pass);
       if (!matched) {
         LOG(INFO) << "   - Skip " << x
                   << " because the target or kernel does not match.";
diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt
index 54a239024413834cb30c6e135c378d10480863e7..b7ddd810af46a25e2c331c2f0364a72f466dc636 100644
--- a/lite/core/profile/CMakeLists.txt
+++ b/lite/core/profile/CMakeLists.txt
@@ -5,4 +5,5 @@ endif()

 lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags)
 lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler)
-
+lite_cc_library(lite_profiler SRCS profiler.cc DEPS context)
+lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler)
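The Profiler added in the files below is driven roughly as follows (a host-timing sketch modeled on the new test in lite/core/profile/test_timer.cc; the op and kernel names are made-up examples):

```cpp
#include "lite/core/profile/profiler.h"
#include "lite/utils/cp_logging.h"

using namespace paddle::lite;

// One timer unit per kernel, keyed by an OpCharacter.
void ProfileOneKernel(KernelContext* ctx) {
  profile::Profiler profiler("demo");
  profile::OpCharacter ch;
  ch.target = TargetType::kHost;  // non-CUDA targets fall back to host timing
  ch.op_type = "conv2d";          // made-up example names
  ch.kernel_name = "conv2d/def";
  int idx = profiler.NewTimer(ch);

  profiler.StartTiming(profile::Type::kDispatch, idx, ctx);
  // ... launch the kernel ...
  profiler.StopTiming(profile::Type::kDispatch, idx, ctx);

  // Detailed table, excluding 0 warm-up laps.
  LOG(INFO) << profiler.Summary(profile::Type::kDispatch, false, 0);
}
```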
diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68
--- /dev/null
+++ b/lite/core/profile/profiler.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/profile/profiler.h"
+#include <map>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+namespace {
+// Lexicographic comparison so that std::map gets a strict weak ordering.
+auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
+  if (c1.target != c2.target) return c1.target < c2.target;
+  if (c1.op_type != c2.op_type) return c1.op_type < c2.op_type;
+  if (c1.kernel_name != c2.kernel_name) return c1.kernel_name < c2.kernel_name;
+  return c1.remark < c2.remark;
+};
+}  // namespace
+
+std::map<Type, std::string> TypeStr{
+    {Type::kUnk, "Unknown"},
+    {Type::kCreate, "Create"},
+    {Type::kDispatch, "Dispatch"},
+};
+
+StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) {
+  create_t.reset(new DeviceTimer<TargetType::kHost>());
+  if (ch.target == TargetType::kCUDA) {
+#ifdef LITE_WITH_CUDA
+    dispatch_t.reset(new DeviceTimer<TargetType::kCUDA>());
+#else
+    LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the "
+                  "default x86 timer is used instead.";
+#endif
+  } else {
+    dispatch_t.reset(new DeviceTimer<TargetType::kHost>());
+  }
+}
+
+lite::profile::Timer* StatisUnit::Timer(Type type) {
+  if (type == Type::kCreate) {
+    return create_t.get();
+  } else if (type == Type::kDispatch) {
+    return dispatch_t.get();
+  }
+  LOG(FATAL) << "Timer cannot be returned for an unknown type.";
+  return nullptr;
+}
+
+int Profiler::NewTimer(const OpCharacter& ch) {
+  StatisUnit unit(ch);
+  units_.push_back(std::move(unit));
+  return units_.size() - 1;
+}
+
+void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
+  CHECK_LT(index, units_.size())
+      << "The timer index in the profiler is out of range.";
+  units_[index].Timer(type)->Start(ctx);
+}
+
+float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
+  CHECK_LT(index, units_.size())
+      << "The timer index in the profiler is out of range.";
+  return units_[index].Timer(type)->Stop(ctx);
+}
+
+std::string Profiler::Summary(Type type, bool concise, size_t w) {
+  using std::setw;
+  using std::left;
+  using std::fixed;
+  STL::stringstream ss;
+  std::string title;
+  // Title.
+  if (concise) {
+    ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size()
+       << std::endl;
+    ss << "===== Concise " << TypeStr.find(type)->second
+       << " Profiler Summary: " << name_ << ", Exclude " << w
+       << " warm-ups =====" << std::endl;
+  } else {
+    ss << "===== Detailed " << TypeStr.find(type)->second
+       << " Profiler Summary: " << name_ << ", Exclude " << w
+       << " warm-ups =====" << std::endl;
+  }
+  ss << setw(25) << left << "Operator Type"
+     << " " << setw(40) << left << "Kernel Name"
+     << " " << setw(12) << left << "Remark"
+     << " " << setw(12) << left << "Avg (ms)"
+     << " " << setw(12) << left << "Min (ms)"
+     << " " << setw(12) << left << "Max (ms)"
+     << " " << setw(12) << left << "Last (ms)" << std::endl;
+  // Profile information.
+  if (concise) {
+    std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
+    for (auto& unit : units_) {
+      auto ch = summary.find(unit.Character());
+      if (ch != summary.end()) {
+        ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
+        ch->second.min += unit.Timer(type)->LapTimes().Min(w);
+        ch->second.max += unit.Timer(type)->LapTimes().Max(w);
+      } else {
+        TimeInfo info({unit.Timer(type)->LapTimes().Avg(w),
+                       unit.Timer(type)->LapTimes().Min(w),
+                       unit.Timer(type)->LapTimes().Max(w)});
+        summary.insert({unit.Character(), info});
+      }
+    }
+    for (const auto& item : summary) {
+      // clang-format off
+      ss << setw(25) << left << fixed << item.first.op_type             \
+         << " " << setw(40) << left << fixed << item.first.kernel_name  \
+         << " " << setw(12) << left << fixed << item.first.remark       \
+         << " " << setw(12) << left << fixed << item.second.avg         \
+         << " " << setw(12) << left << fixed << item.second.min         \
+         << " " << setw(12) << left << fixed << item.second.max         \
+         << " " << std::endl;
+      // clang-format on
+    }
+  } else {
+    for (auto& unit : units_) {
+      const auto& times = unit.Timer(type)->LapTimes();
+      // clang-format off
+      ss << setw(25) << left << fixed << unit.Character().op_type            \
+         << " " << setw(40) << left << fixed << unit.Character().kernel_name \
+         << " " << setw(12) << left << fixed << unit.Character().remark      \
+         << " " << setw(12) << left << fixed << times.Avg(w)                 \
+         << " " << setw(12) << left << fixed << times.Min(w)                 \
+         << " " << setw(12) << left << fixed << times.Max(w)                 \
+         << " " << setw(12) << left << fixed << times.Last(w)                \
+         << std::endl;
+      // clang-format on
+    }
+  }
+  return ss.str();
+}
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..3933e5ba01ebcb20420494a955cbc0e202879f76
--- /dev/null
+++ b/lite/core/profile/profiler.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/profile/timer.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+enum class Type {
+  kUnk = 0,
+  kCreate,
+  kDispatch,
+};
+
+extern std::map<Type, std::string> TypeStr;
+
+struct TimeInfo {
+  float avg;
+  float min;
+  float max;
+};
+
+struct OpCharacter {
+  TargetType target;
+  std::string op_type{std::string("N/A")};
+  std::string kernel_name{std::string("N/A")};
+  std::string remark{std::string("N/A")};
+};
+
+class StatisUnit final {
+ public:
+  explicit StatisUnit(const OpCharacter& ch);
+  lite::profile::Timer* Timer(Type type);
+  const OpCharacter& Character() const { return character; }
+
+ protected:
+  std::unique_ptr<lite::profile::Timer> create_t;
+  std::unique_ptr<lite::profile::Timer> dispatch_t;
+  OpCharacter character;
+};
+
+class Profiler final {
+ public:
+  Profiler() = default;
+  explicit Profiler(const std::string& name) : name_(name) {}
+  int NewTimer(const OpCharacter& ch);
+  void StartTiming(Type type, const int index, KernelContext* ctx);
+  float StopTiming(Type type, const int index, KernelContext* ctx);
+  std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
+
+ private:
+  std::string name_{std::string("N/A")};
+  std::vector<StatisUnit> units_;
+};
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3841f0151890d377a87f4f5d4b6d069ee75b560e
--- /dev/null
+++ b/lite/core/profile/test_timer.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
+#include "lite/core/context.h"
+#include "lite/core/profile/profiler.h"
+#include "lite/core/profile/timer.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+TEST(timer, real_latency) {
+  Timer timer;
+
+  timer.Start();
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  timer.Stop();
+
+  timer.Start();
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  timer.Stop();
+
+  LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
+}
+
+#ifdef LITE_WITH_CUDA
+TEST(gpu_timer, real_latency) {
+  DeviceTimer<TargetType::kCUDA> timer;
+  KernelContext ctx;
+  cudaStream_t exec_stream;
+  cudaStreamCreate(&exec_stream);
+  (&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
+
+  timer.Start(&ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  timer.Stop(&ctx);
+
+  (&timer)->Start(&ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  timer.Stop(&ctx);
+
+  LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
+}
+
+TEST(profiler, real_latency) {
+  KernelContext ctx;
+  cudaStream_t exec_stream;
+  cudaStreamCreate(&exec_stream);
+  (&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
+
+  Profiler profiler("name");
+  profile::OpCharacter ch;
+  ch.target = TargetType::kCUDA;
+  ch.op_type = "operator/1";
+  ch.kernel_name = "kernel/1";
+  int idx = profiler.NewTimer(ch);
+  profiler.StartTiming(Type::kDispatch, idx, &ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  profiler.StopTiming(Type::kDispatch, idx, &ctx);
+  std::cout << profiler.Summary(Type::kDispatch);
+}
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9bb16bd27d5ec6fd21814c35db52b2467a12b51
--- /dev/null
+++ b/lite/core/profile/timer.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <vector>
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+#include "lite/core/context.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+template <typename T>
+class TimeList {
+ public:
+  void Clear() { laps_t_.clear(); }
+  void Add(T t) { laps_t_.push_back(t); }
+  T Last(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return laps_t_.back();
+  }
+  T Max(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return *std::max_element((laps_t_.begin() + offset), laps_t_.end());
+  }
+  T Min(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return *std::min_element((laps_t_.begin() + offset), laps_t_.end());
+  }
+  T Sum(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return std::accumulate((laps_t_.begin() + offset), laps_t_.end(), 0.0);
+  }
+  size_t Size(size_t offset = 0) const {
+    size_t size = (laps_t_.size() <= offset) ? 0 : (laps_t_.size() - offset);
+    return size;
+  }
+  T Avg(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return Sum(offset) / Size(offset);
+  }
+  const std::vector<T>& Raw() const { return laps_t_; }
+
+ private:
+  std::vector<T> laps_t_;
+};
+
+class Timer {
+ public:
+  Timer() = default;
+  virtual ~Timer() = default;
+
+  void Reset() { laps_t_.Clear(); }
+  void Start() { t_start_ = std::chrono::system_clock::now(); }
+  float Stop() {
+    t_stop_ = std::chrono::system_clock::now();
+    auto ts = std::chrono::duration_cast<std::chrono::microseconds>(t_stop_ -
+                                                                    t_start_);
+    float elapse_ms = 1000.f * static_cast<float>(ts.count()) *
+                      std::chrono::microseconds::period::num /
+                      std::chrono::microseconds::period::den;
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+  virtual void Start(KernelContext* ctx) { return Start(); }
+  virtual float Stop(KernelContext* ctx) { return Stop(); }
+  float AvgLapTimeMs() const { return laps_t_.Avg(); }
+  const TimeList<float>& LapTimes() const { return laps_t_; }
+
+ protected:
+  TimeList<float> laps_t_;
+
+ private:
+  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
+};
+
+template <TargetType Target>
+class DeviceTimer final : public Timer {};
+
+#ifdef LITE_WITH_CUDA
+template <>
+class DeviceTimer<TargetType::kCUDA> final : public Timer {
+ public:
+  DeviceTimer() {
+    CUDA_CALL(cudaEventCreate(&e_start_));
+    CUDA_CALL(cudaEventCreate(&e_stop_));
+  }
+  ~DeviceTimer() {
+    CUDA_CALL(cudaEventDestroy(e_start_));
+    CUDA_CALL(cudaEventDestroy(e_stop_));
+  }
+  void Start(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_start_, stream));
+  }
+  float Stop(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_stop_, stream));
+    CUDA_CALL(cudaEventSynchronize(e_stop_));
+    float elapse_ms = 1.f;
+    CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_));
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+
+ private:
+  cudaEvent_t e_start_, e_stop_;
+};
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
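A quick worked example of the TimeList offset semantics above (the `offset` counts leading warm-up laps that are excluded from the statistics; the helper function is ours):

```cpp
#include <cassert>
#include "lite/core/profile/timer.h"

// Worked example: offset = number of warm-up laps to ignore.
void TimeListOffsetExample() {
  paddle::lite::profile::TimeList<float> laps;
  laps.Add(10.f);  // warm-up lap
  laps.Add(2.f);
  laps.Add(4.f);
  assert(laps.Size(1) == 2);    // the warm-up lap is excluded
  assert(laps.Avg(1) == 3.f);   // (2 + 4) / 2
  assert(laps.Max(0) == 10.f);  // offset 0 keeps every lap
}
```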
diff --git a/lite/core/program.cc b/lite/core/program.cc
index b60f279c0fc74904477a080579a799f601e359b0..0895643a6adde0095f9d2892c41f263eedd4284f 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -17,6 +17,8 @@
 #include "lite/model_parser/cpp/block_desc.h"
 #include "lite/model_parser/cpp/op_desc.h"
 #include "lite/model_parser/cpp/var_desc.h"
+#include "lite/operators/conditional_block_op.h"
+#include "lite/operators/subgraph_op.h"
 #include "lite/operators/while_op.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/precision_profiler.h"
@@ -30,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) {
   // NOTE: RuntimeProgram does not have all the meta info, so saving the model
   // just updates the origin model
   CHECK(desc->BlocksSize());
-  auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
-  main_block.ClearOps();
+  auto main_block = desc->GetBlock<cpp::BlockDesc>(0);
+  main_block->ClearOps();
   for (auto& node : instructions_) {
-    auto* op = main_block.AddOp<cpp::OpDesc>();
+    auto op_type = node.op()->op_info()->Type();
+    if (op_type == "subgraph") {
+      auto subgraph_op = const_cast<operators::SubgraphOp*>(
+          static_cast<const operators::SubgraphOp*>(node.op()));
+      int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
+      if (sub_block_idx < 0) {
+        // It's a new subgraph op when its sub_block_idx < 0. Add its
+        // subblock desc to the program desc, then update its sub_block_idx
+        // to the index of that block desc in the program desc.
+        sub_block_idx = desc->BlocksSize();
+        auto sub_block_desc = subgraph_op->GetSubBlock();
+        CHECK(sub_block_desc);
+        auto new_block_desc = desc->AddBlock<cpp::BlockDesc>();
+        *new_block_desc = *sub_block_desc;
+        delete sub_block_desc;
+        subgraph_op->mutable_op_info()->SetAttr<int32_t>("sub_block",
+                                                         sub_block_idx);
+        subgraph_op->SetSubBlock(new_block_desc);
+        // Update main block desc after a new subblock desc is added
+        main_block = desc->GetBlock<cpp::BlockDesc>(0);
+      }
+    }
+    auto op = main_block->AddOp<cpp::OpDesc>();
     *op = *node.op()->op_info();
     op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType());
   }
@@ -113,15 +137,21 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {

 void RuntimeProgram::Run() {
   for (auto& inst : instructions_) {
-    std::string op_type = inst.op()->op_info()->Type();
-    if (op_type == "feed" || op_type == "fetch") continue;
+#ifndef LITE_WITH_FPGA
+    if (inst.is_feed_fetch_op()) continue;
+#endif
     inst.Run();
 #ifdef LITE_WITH_PROFILE
 #ifdef LITE_WITH_PRECISION_PROFILE
+#ifndef LITE_WITH_FPGA
     LITE_PRECISION_PROFILE(inst)
+#endif
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
   }
+#ifdef LITE_WITH_PROFILE
+  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
+#endif  // LITE_WITH_PROFILE
 }

 void Program::Build(const cpp::ProgramDesc& prog) {
@@ -138,12 +168,26 @@ void Program::Build(const cpp::ProgramDesc& prog) {
     VLOG(4) << "create Op [" << op_type << "]";
     auto op = LiteOpRegistry::Global().Create(op_type);
     CHECK(op) << "no Op found for " << op_type;
-    if (op_type == "while") {
+    if (op_type == "while" || op_type == "conditional_block" ||
+        op_type == "subgraph") {
       auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
-      auto sub_block =
+      CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize())
+          << "Invalid attribute sub_block(" << sub_block_idx << ") for "
+          << op_type;
+      auto sub_block_desc =
           const_cast<cpp::ProgramDesc&>(prog).GetBlock<cpp::BlockDesc>(
               sub_block_idx);
-      static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(sub_block);
+      CHECK(sub_block_desc);
+      if (op_type == "while") {
+        static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      } else if (op_type == "conditional_block") {
+        static_cast<operators::ConditionalBlockOpLite*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      } else if (op_type == "subgraph") {
+        static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      }
     }
     ops_.emplace_back(std::move(op));
     ops_.back()->Attach(op_desc, exec_scope_);
@@ -159,6 +203,27 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
   tmp_vars_.push_back("feed");
   tmp_vars_.push_back("fetch");

+  auto VarPrecision2KernelPrecision =
+      [](const lite::VarDescAPI::Type& type) -> PrecisionType {
+    switch (type) {
+      case lite::VarDescAPI::Type::FP32:
+        return PRECISION(kFloat);
+      case lite::VarDescAPI::Type::FP16:
+        return PRECISION(kFP16);
+      case lite::VarDescAPI::Type::INT8:
+        return PRECISION(kInt8);
+      case lite::VarDescAPI::Type::INT16:
+        return PRECISION(kInt16);
+      case lite::VarDescAPI::Type::INT32:
+        return PRECISION(kInt32);
+      case lite::VarDescAPI::Type::INT64:
+        return PRECISION(kInt64);
+      default:
+        // LOG(FATAL) << "not supported type: " << static_cast<int>(type);
+        return PRECISION(kUnk);
+    }
+  };
+
   auto program = prog;
   CHECK(program.BlocksSize());
   for (size_t b = 0; b < program.BlocksSize(); ++b) {
@@ -166,7 +231,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
     for (size_t i = 0; i < main_block.VarsSize(); ++i) {
       auto& var_desc = *main_block.GetVar(i);
       if (!var_desc.Persistable()) {
+        if (var_desc.GetType() == lite::VarDescAPI::Type::LOD_TENSOR &&
+            VarPrecision2KernelPrecision(var_desc.GetDataType()) !=
+                PRECISION(kUnk)) {
+          var_data_type_[var_desc.Name()] =
+              VarPrecision2KernelPrecision(var_desc.GetDataType());
+        }
         tmp_vars_.push_back(var_desc.Name());
+        VLOG(4) << "var name: " << var_desc.Name() << " type is "
+                << static_cast<int>(var_desc.GetType()) << " data type is "
+                << static_cast<int>(var_desc.GetDataType());
         exec_scope_->Var(var_desc.Name());
         if (b > 0) {
           VLOG(4) << "var: " << var_desc.Name();
@@ -181,13 +255,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
 }

 void Instruction::Run() {
+#ifdef LITE_WITH_PROFILE
+  CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
+                      "When LITE_WITH_PROFILE is defined, please set a "
+                      "Profiler for Instruction.";
+  profiler_->StartTiming(
+      profile::Type::kCreate, profile_id_, kernel_->mutable_context());
+#endif
   CHECK(op_) << "op null";
   CHECK(kernel_) << "kernel null";
-#ifdef LITE_WITH_PROFILE
-  if (profile_id_ >= 0) {
-    profile::ProfileBlock x(profile_id_, "instruction");
-  }
-#endif  // LITE_WITH_PROFILE
+
   if (first_epoch_) {
     first_epoch_ = false;
    CHECK(op_->CheckShape());
@@ -196,14 +273,8 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "kernel launch";
-#endif
+
   op_->InferShape();
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
-          << TargetToStr(kernel_->target());
-#endif
   kernel_->Launch();
   has_run_ = true;
 }
diff --git a/lite/core/program.h b/lite/core/program.h
index 7a6700da61f7ba9f35491613d7733b4b637b8ff0..c845a17c52c0c565e339a13e093f3e8f59e8d4a7 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -16,15 +16,13 @@
 #include <list>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "lite/core/kernel.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/cpp/program_desc.h"
-#ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
-#endif  // LITE_WITH_PROFILE

 namespace paddle {
 namespace lite {
@@ -66,6 +64,10 @@ struct Program {
   lite::Scope* exec_scope() { return exec_scope_; }
   lite::Scope* scope() { return scope_.get(); }

+  const std::unordered_map<std::string, PrecisionType>& var_data_type() const {
+    return var_data_type_;
+  }
+
  private:
   // Build from a program and scope.
   void Build(const cpp::ProgramDesc& program);
@@ -73,6 +75,7 @@ struct Program {
   void PrepareWorkspace(const cpp::ProgramDesc& program);

  private:
+  std::unordered_map<std::string, PrecisionType> var_data_type_;
   std::list<std::string> tmp_vars_;
   std::list<std::string> weights_;
   std::list<std::shared_ptr<OpLite>> ops_;
@@ -88,20 +91,10 @@ struct Instruction {
   Instruction(const std::shared_ptr<OpLite>& op,
               std::unique_ptr<KernelBase>&& kernel)
       : op_(op), kernel_(std::move(kernel)) {
-#ifdef LITE_WITH_PROFILE
-    if (op_->Type() != "feed" && op_->Type() != "fetch") {
-      profile_id_ = profile::BasicProfiler<profile::Inst>::Global()
-                        .NewRcd(kernel_->SerializedKernelType())
-                        .id();
-      kernel_->SetProfileID(profile_id_);
-      // Set profile custom info
-      auto& profiler =
-          *profile::BasicProfiler<profile::Inst>::Global().mutable_record(
-              profile_id_);
-      profiler.SetCustomInfo("op_type", op_->Type());
-      profiler.SetCustomInfo("op_info", op_->SerializedOpInfo());
+    std::string op_type = op->Type();
+    if (op_type == "feed" || op_type == "fetch") {
+      is_feed_fetch_op_ = true;
     }
-#endif  // LITE_WITH_PROFILE
   }

   // Run the instruction.
@@ -113,14 +106,31 @@
   const KernelBase* kernel() const { return kernel_.get(); }
   KernelBase* mutable_kernel() { return kernel_.get(); }

+  bool is_feed_fetch_op() const { return is_feed_fetch_op_; }
+
+#ifdef LITE_WITH_PROFILE
+  void set_profiler(profile::Profiler* profiler) {
+    profiler_ = profiler;
+    if (op_->Type() != "feed" && op_->Type() != "fetch") {
+      profile::OpCharacter ch;
+      ch.target = kernel()->target();
+      ch.op_type = op_->Type();
+      ch.kernel_name = kernel()->name();
+      profile_id_ = profiler->NewTimer(ch);
+      kernel_->SetProfiler(profiler_, profile_id_);
+    }
+  }
+#endif
+
  private:
   std::shared_ptr<OpLite> op_;
   std::unique_ptr<KernelBase> kernel_;
+  bool is_feed_fetch_op_{false};
   bool first_epoch_{true};
   bool has_run_{false};

 #ifdef LITE_WITH_PROFILE
-  // for profiler
+  profile::Profiler* profiler_;
   int profile_id_{-1};
 #endif  // LITE_WITH_PROFILE
 };
@@ -135,6 +145,15 @@ class LITE_API RuntimeProgram {
     if (instructions_.empty()) {
       LOG(FATAL) << "no instructions";
     }
+#ifdef LITE_WITH_PROFILE
+    set_profiler();
+#endif
+  }
+  ~RuntimeProgram() {
+#ifdef LITE_WITH_PROFILE
+    LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
+    LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
+#endif  // LITE_WITH_PROFILE
   }

   void Run();
@@ -159,6 +178,15 @@ class LITE_API RuntimeProgram {
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
   lite::Scope* exec_scope_{};
+
+#ifdef LITE_WITH_PROFILE
+  profile::Profiler profiler_;
+  void set_profiler() {
+    for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
+      i->set_profiler(&profiler_);
+    }
+  }
+#endif
 };

 }  // namespace lite
diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc
index 1c7db871c7b525d6e4944fd0d669e81bcaff7f2a..38a6be6767eae62f9d91c9c11811bc49639331bf 100644
--- a/lite/core/tensor.cc
+++ b/lite/core/tensor.cc
@@ -25,21 +25,17 @@ using value_type = int64_t;

 value_type DDimLite::production() const {
   value_type res = 1;
-  for (size_t i = 0; i < this->size(); i++) {
-    res *= (*this)[i];
+  for (size_t i = 0; i < data_.size(); i++) {
+    res *= data_[i];
   }
   return res;
 }

 value_type DDimLite::count(int start, int end) const {
-  if (start < 0) {
-    start = 0;
-  }
-  if (end > size()) {
-    end = size();
-  }
+  start = std::max(start, 0);
+  end = std::min(end, static_cast<int>(data_.size()));
   if (end < start) {
-    end = start;
+    return 0;
   }
   value_type sum = 1;
   for (auto i = start; i < end; ++i) {
@@ -49,11 +45,13 @@ value_type DDimLite::count(int start, int end) const {
 }

 DDimLite DDimLite::Slice(int start, int end) const {
-  std::vector<value_type> vec;
+  start = std::max(start, 0);
+  end = std::min(end, static_cast<int>(data_.size()));
+  std::vector<value_type> new_dim(end - start);
   for (int i = start; i < end; i++) {
-    vec.push_back((*this)[i]);
+    new_dim[i - start] = data_[i];
   }
-  return DDimLite(vec);
+  return DDim(new_dim);
 }

 std::string DDimLite::repr() const {
@@ -104,6 +102,12 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
   if (nullptr == buffer_->data()) return nullptr;
   return static_cast<const cl::Image2D *>(buffer_->data());
 }
+
+template <>  // use int16_t to represent half float
+const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const {
+  if (nullptr == buffer_->data()) return nullptr;
+  return static_cast<const cl::Image2D *>(buffer_->data());
+}
 #endif

 }  // namespace lite
diff --git a/lite/core/tensor.h b/lite/core/tensor.h
index 8c4fe1604a517332e52b243404828e81af26f419..04e540002b553a0e0f7db0144fd970bdb6a4d9ed 100644
--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
@@ -85,7 +85,11 @@ class DDimLite {
   }

   friend bool operator!=(const DDimLite &a, const DDimLite &b) {
-    return !(a == b);
+    if (a.size() != b.size()) return true;
+    for (size_t i = 0; i < a.size(); i++) {
+      if (a[i] != b[i]) return true;
+    }
+    return false;
   }

  private:
@@ -118,7 +122,7 @@ class TensorLite {
   }

   void Resize(const DDimLite &ddim) { dims_ = ddim; }
-  void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
+  void Resize(const std::vector<int64_t> &x) { dims_.ConstructFrom(x); }

   const DDimLite &dims() const { return dims_; }
   int64_t numel() const { return dims_.production(); }
@@ -139,6 +143,7 @@ class TensorLite {
   // For other devices, T and R may be the same type.
   template <typename T, typename R = T>
   R *mutable_data() {
+    precision_ = lite_api::PrecisionTypeTrait<T>::Type();
     memory_size_ = dims_.production() * sizeof(T);
     buffer_->ResetLazy(target_, memory_size_);
     return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
@@ -147,9 +152,11 @@ class TensorLite {

 #ifdef LITE_WITH_OPENCL
   template <typename T, typename R = T>
-  R *mutable_data(const size_t img_w, const size_t img_h) {
+  R *mutable_data(const size_t img_w,
+                  const size_t img_h,
+                  void *host_ptr = nullptr) {
     target_ = TARGET(kOpenCL);
-    buffer_->ResetLazyImage2D(target_, img_w, img_h);
+    buffer_->ResetLazyImage2D(target_, img_w, img_h, host_ptr);
     return static_cast<R *>(buffer_->data());
   }
 #endif
@@ -161,10 +168,7 @@ class TensorLite {
   template <typename T, typename R = T>
   R *mutable_data(TargetType target) {
     target_ = target;
-    memory_size_ = dims_.production() * sizeof(T);
-    buffer_->ResetLazy(target, memory_size());
-    return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
-                                 offset_);
+    return mutable_data<T, R>();
   }
   void *mutable_data(size_t memory_size);
   void *mutable_data(TargetType target, size_t memory_size);
@@ -174,6 +178,10 @@ class TensorLite {
                                  (static_cast<char *>(buffer_->data()) + offset_));
   }

+  void clear() {
+    buffer_->Free();
+    offset_ = 0;
+  }
   size_t data_size() const { return this->dims().production(); }

   size_t memory_size() const { return memory_size_; }
@@ -251,6 +259,9 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
 #ifdef LITE_WITH_OPENCL
 template <>
 const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
+
+template <>  // use int16_t to represent half float
+const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const;
 #endif

 }  // namespace lite
diff --git a/lite/core/version.h.in b/lite/core/version.h.in
index 3082adc5abecb20f5ce19032177fc7cdb75299ff..d34c32073b852a50b5d26984ed4812ac4f38a870 100644
--- a/lite/core/version.h.in
+++ b/lite/core/version.h.in
@@ -42,7 +42,7 @@ static std::string version() {

   std::string tag = paddlelite_tag();
   if (tag.empty()) {
-    ss << paddlelite_branch() << "(" << paddlelite_commit() << ")";
+    ss << paddlelite_commit();
   } else {
     ss << tag;
   }
diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def
index 1b5da970e8fa9b2793f7a4982d5ed22ed21e79fd..800331035323735c01b04940e70fd034ede51c84 100644
--- a/lite/demo/cxx/Makefile.def
+++ b/lite/demo/cxx/Makefile.def
@@ -1,35 +1,43 @@
-CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
-              -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
-LDFLAGS = -latomic -pthread -ldl
+# get the name of the current operating system: Linux or Darwin
+SYSTEM=$(shell "uname")

-SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot
+CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
+              -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
+LDFLAGS = -latomic -pthread -ldl -llog -lz

-THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a
+SYSROOT_COMPLILE = --sysroot=$(NDK_ROOT)/sysroot
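+# Usage sketch (hypothetical NDK location): point NDK_ROOT at an
+# android-ndk-r17c style install before invoking make, e.g.
+#   export NDK_ROOT=/opt/android-ndk-r17c
+#   make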
-SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \ - -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \ - -I/opt/android-ndk-r17c/sources/android/support/include \ - -I/opt/android-ndk-r17c/sysroot/usr/include \ +SYSTEM_INCLUDES = -I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/include \ + -I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++abi/include \ + -I$(NDK_ROOT)/sources/android/support/include \ + -I$(NDK_ROOT)/sysroot/usr/include \ -THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include ifeq ($(ARM_ABI), arm8) - CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ - CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE - CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections - SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 - SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ - /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) + ifeq ($(SYSTEM), Linux) + CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ + else + CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/darwin-x86_64/bin/aarch64-linux-android-g++ + endif + CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections + SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-24/arch-arm64 + SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ + $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a + INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/aarch64-linux-android else - CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ + ifeq ($(SYSTEM), Linux) + CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ + else + CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-g++ + endif CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ - -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc - SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-23/arch-arm - SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \ - /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ - /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ - /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES) + SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-23/arch-arm + SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \ + $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ + 
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \
+                  $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
+    INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/arm-linux-androideabi
 endif
diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md
index ec72c044e3fd08bd775b23c373945c5bb5743d1d..447bcbaff018d15a1bc3075c1153f724672f40a8 100644
--- a/lite/demo/cxx/README.md
+++ b/lite/demo/cxx/README.md
@@ -1,42 +1,181 @@
 # C++ Demo
-1. Build the docker image with `lite/tools/Dockerfile.mobile`
-2. Run and enter the docker environment, then download the demo package with `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz` (for the armv7 demo, use `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz`).
-3. Extract the downloaded archive: `tar zxvf inference_lite_lib.android.armv8.tar.gz`
-4. Prepare the emulator environment with the following commands
+
+> You are welcome to join the official PaddleLite QQ group (696965088), where the team answers questions and concerns.
+
+1. Prerequisites
+    - a computer able to compile PaddleLite
+    - an Android phone with an armv7 or armv8 CPU
+
+2. Face detection and mask-wearing classification demo
+
+Prepare the build environment by following [Compiling from source](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/).
+
+Run the following commands to download the PaddleLite code.
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+```
+
+From the PaddleLite root directory, build the inference library.
 ```shell
-# armv8
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  --shutdown_log=OFF \
+  tiny_publish
 ```
+
+Enter the build directory, download the model and image archive, and build the executable.
 ```shell
-# armv7
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
+cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mask_detection
+wget https://paddle-inference-dist.bj.bcebos.com/mask_detection.tar.gz
+tar zxvf mask_detection.tar.gz
+make
+```
+
+Alternatively, the face detection and mask classification models can be downloaded through PaddleHub.
-5. Prepare the model, then build and run the full-API demo
+```
+# After installing paddlehub, run the following code in Python
+import paddlehub as hub
+pyramidbox_lite_mobile_mask = hub.Module(name="pyramidbox_lite_mobile_mask")
+# Save the models into the test_program folder
+pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="test_program")
+# The commands above produce the face detection and mask-wearing classification models, stored in pyramidbox_lite and mask_detector respectively. In each folder, __model__ is the topology file and the __param__ file holds the weights.
+# PaddleHub provides inference models, so they must be converted with PaddleLite's model_optimize_tool; see the [model conversion docs](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/model_optimize_tool/).
+```
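+
+For reference, that conversion step looks roughly like this (flag names per the model_optimize_tool documentation; the input/output paths are hypothetical):
+```shell
+./model_optimize_tool \
+  --model_file=./test_program/mask_detector/__model__ \
+  --param_file=./test_program/mask_detector/__param__ \
+  --optimize_out_type=naive_buffer \
+  --valid_targets=arm \
+  --optimize_out=./mask_detector_opt
+```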
+
+Connect the Android phone to the computer, then push the executable, test image, model files, and inference library to the phone.
+```
+adb push mask_detection /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push face_detection /data/local/tmp
+adb push mask_classification /data/local/tmp
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mask_detection
+```
+
+On the phone, run the demo.
+```
+adb shell
+cd /data/local/tmp
+export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH
+./mask_detection face_detection mask_classification test.jpg
+```
+
+Back on the computer, pull the result and inspect the output image.
+```
+adb pull /data/local/tmp/test_mask_detection_result.jpg ./
+```
+
+![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg)
+
+3. Build and run the full-API demo (note: this demo is not available when the build mode is tiny_publish)
 ```shell
 cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
 wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
 tar zxvf mobilenet_v1.tar.gz
 make
-adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
-adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
+adb push mobilenet_v1 /data/local/tmp/
+adb push mobilenetv1_full_api /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobilenetv1_full_api
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
 ```
 On success, the console prints the predicted probabilities of the top-10 classes

-6. Build and run the light-API demo
+4. Build and run the light-API demo
 ```shell
 cd ../mobile_light
 make
-adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt"
+adb push mobilenetv1_light_api /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt"
+```
+On success, the console prints the predicted probabilities of the top-10 classes
+
+
+5. Build and run the SSD object detection demo
+```shell
+cd ../ssd_detection
+wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
+tar zxvf mobilenetv1-ssd.tar.gz
+make
+adb push ssd_detection /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push mobilenetv1-ssd /data/local/tmp
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/ssd_detection
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
+adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
+```
+On success, the detection result image test_ssd_detection_result.jpg is generated in the ssd_detection directory.
+
+6. Build and run the YOLOv3 object detection demo
+```shell
+cd ../yolov3_detection
+wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
+tar zxvf mobilenetv1-yolov3.tar.gz
+make
+adb push yolov3_detection /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push mobilenetv1-yolov3 /data/local/tmp
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/yolov3_detection
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg"
+adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
+```
+On success, the detection result image test_yolov3_detection_result.jpg is generated in the yolov3_detection directory.
+
+7. Build and run the image classification demo
+```shell
+cd ../mobile_classify
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+./model_optimize_tool optimize model
+make
+
+adb push mobile_classify /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push labels.txt /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/mobile_classify
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+On success, the console prints the predicted probabilities of the top 5 classes (the abbreviated `./model_optimize_tool optimize model` step above is expanded in the sketch after this section).
+- To see the predicted probabilities of the top 10 classes instead, append a topk value to the run command,
+  e.g.:
+  ```shell
+  adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
+  ```
+- To see classification results for another model, append its model_dir and the model's input size to the run command,
+  e.g.:
+  ```shell
+  adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
+  ```
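+
+A possible expansion of the abbreviated `./model_optimize_tool optimize model` step above (an illustrative sketch, assuming the tool was built with `./lite/tools/build.sh build_optimize_tool` as shown in the test_cv README at the end of this patch; the output name is chosen to match the mobilenetv1opt2 directory used in the run commands):
+```shell
+./model_optimize_tool \
+  --model_dir=mobilenet_v1 \
+  --optimize_out_type=naive_buffer \
+  --optimize_out=mobilenetv1opt2 \
+  --prefer_int8_kernel=false
+```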
+
+8. Build the model unit-test demo that uses the CV preprocessing library
+```shell
+cd ../test_cv
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+./model_optimize_tool optimize model
+make
+adb push test_model_cv /data/local/tmp/
+adb push test.jpg /data/local/tmp/
+adb push labels.txt /data/local/tmp/
+adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
+adb shell chmod +x /data/local/tmp/test_model_cv
+adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+On success, the console prints the predicted probabilities of the top 10 classes.
diff --git a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7
new file mode 100644
index 0000000000000000000000000000000000000000..dd6d4b0960160e140e2f051b78814d2fee08d5e0
--- /dev/null
+++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7
@@ -0,0 +1,61 @@
+ARM_ABI = arm7
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of the static libraries:                     #
+#   `libpaddle_api_full_bundled.a`                            #
+#   `libpaddle_api_light_bundled.a`                           #
+###############################################################
+# Note: the shared library is used by default.                #
+###############################################################
+# 1. Comment above line using `libpaddle_light_api_shared.so`
+# 2.
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mask_detection: fetch_opencv mask_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS) + +mask_detection.o: mask_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mask_detection.o + rm -f mask_detection diff --git a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..c2f601ed2f68c342b47c5add451f84c537f978de --- /dev/null +++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mask_detection: fetch_opencv mask_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS) + +mask_detection.o: mask_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mask_detection.o + rm -f mask_detection diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..8d446af9b174d8876fdd9aafd64bc2057dd7e17e --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_classify: fetch_opencv mobile_classify.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS) + +mobile_classify.o: mobile_classify.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_classify.o + rm -f mobile_classify diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..255c42f2dca5364d9a639c993737608657568b17 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_classify: fetch_opencv mobile_classify.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS) + +mobile_classify.o: mobile_classify.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_classify.o + rm -f mobile_classify diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 index f795b41d46acc3be67ff6c1a0bba0de1c1d8c82d..8ab8a3b7436c836f681510e28461628ed1038709 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 index d0767145b00bd40a3fbeff2aef4f7a0fc6f542d6..c13320603bcce91ebe1fca9014e36b07540abca1 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 index d235d6e25fa9abe47ba50d8336cafcdd6580e30d..9150ae6e44e2314a482f7fcb3d139a20cf9f0304 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 index b91aadcef813de2a6f3371fe2cc4989bd87cf1ab..7a2dbdd0fcc9611fe79fb2660ad215ac4ba0d769 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..05f1c2e276b9cc41cfd4e3f9b4c82790d844ba52 --- /dev/null +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# 
`libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) + +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..77ff07df9541c554ac5fabf3cf56ee4a8904ea9c --- /dev/null +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) + +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..d659a316cd856fd550e83b125573409f239b8cf2 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -0,0 +1,71 @@ +ARM_ABI = arm7 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..c80b07d5c029a3624a514e07375fd08e8770da25 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -0,0 +1,70 @@ +ARM_ABI = arm8 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..b584f5623594fd64f10a86766828c62cdfe08aef --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..27779817012bce527d4506a0dcd377bf4ced3c1a --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..748b84365fc70aa59171a6bf8847f554308fdc8c --- /dev/null +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -0,0 +1,246 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <arm_neon.h>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+struct Object {
+  int batch_id;
+  cv::Rect rec;
+  int class_id;
+  float prob;
+};
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+// Fill the tensor with mean and scale, and transpose the layout from
+// NHWC to NCHW; NEON intrinsics process four pixels per iteration.
+void neon_mean_scale(const float* din,
+                     float* dout,
+                     int size,
+                     const std::vector<float> mean,
+                     const std::vector<float> scale) {
+  if (mean.size() != 3 || scale.size() != 3) {
+    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
+    exit(1);
+  }
+  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
+  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
+  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
+  float32x4_t vscale0 = vdupq_n_f32(scale[0]);
+  float32x4_t vscale1 = vdupq_n_f32(scale[1]);
+  float32x4_t vscale2 = vdupq_n_f32(scale[2]);
+
+  float* dout_c0 = dout;
+  float* dout_c1 = dout + size;
+  float* dout_c2 = dout + size * 2;
+
+  int i = 0;
+  for (; i < size - 3; i += 4) {
+    float32x4x3_t vin3 = vld3q_f32(din);
+    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
+    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
+    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
+    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
+    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
+    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
+    vst1q_f32(dout_c0, vs0);
+    vst1q_f32(dout_c1, vs1);
+    vst1q_f32(dout_c2, vs2);
+
+    din += 12;
+    dout_c0 += 4;
+    dout_c1 += 4;
+    dout_c2 += 4;
+  }
+  // Scalar tail for the remaining pixels
+  for (; i < size; i++) {
+    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
+    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
+    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
+  }
+}
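+
+// For each channel c, the routine above computes
+//   dout_c[i] = (din[3 * i + c] - mean[c]) * scale[c]
+// writing one contiguous plane per channel, so the interleaved BGR/RGB
+// (NHWC) input becomes the planar (NCHW) layout expected by the model's
+// input tensor.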
+
+void pre_process(const cv::Mat& img,
+                 int width,
+                 int height,
+                 const std::vector<float>& mean,
+                 const std::vector<float>& scale,
+                 float* data,
+                 bool is_scale = false) {
+  cv::Mat resized_img;
+  cv::resize(
+      img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC);
+  cv::Mat imgf;
+  float scale_factor = is_scale ? 1.f / 256 : 1.f;
+  resized_img.convertTo(imgf, CV_32FC3, scale_factor);
+  const float* dimg = reinterpret_cast<const float*>(imgf.data);
+  neon_mean_scale(dimg, data, width * height, mean, scale);
+}
+
+void RunModel(std::string det_model_dir,
+              std::string class_model_dir,
+              std::string img_path) {
+  // Prepare the input image; the detector runs on a shrunk copy
+  cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
+  float shrink = 0.2;
+  int width = img.cols;
+  int height = img.rows;
+  int s_width = static_cast<int>(width * shrink);
+  int s_height = static_cast<int>(height * shrink);
+
+  // Detection
+  MobileConfig config;
+  config.set_model_dir(det_model_dir);
+
+  // Create the predictor for the detection model
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // Get the input tensor
+  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
+  input_tensor0->Resize({1, 3, s_height, s_width});
+  auto* data = input_tensor0->mutable_data<float>();
+
+  // Preprocess
+  std::vector<float> detect_mean = {104.f, 117.f, 123.f};
+  std::vector<float> detect_scale = {0.007843, 0.007843, 0.007843};
+  pre_process(img, s_width, s_height, detect_mean, detect_scale, data, false);
+
+  // Run the detection model
+  predictor->Run();
+
+  // Get the output tensor
+  std::unique_ptr<const Tensor> output_tensor0(
+      std::move(predictor->GetOutput(0)));
+  auto* outptr = output_tensor0->data<float>();
+  auto shape_out = output_tensor0->shape();
+  int64_t out_len = ShapeProduction(shape_out);
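+
+  // The detector output holds 6 floats per candidate box:
+  //   [class_id, confidence, xmin, ymin, xmax, ymax]
+  // with the box coordinates normalized to [0, 1], so they are rescaled
+  // by the original image size below.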
"wear mask" : "no mask"; + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 1; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width; + text_size = + cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = rec_clip.x + 5; + origin.y = rec_clip.y + text_size.height + 5; + cv::putText(img, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detect face, location: x=" << rec_clip.x + << ", y=" << rec_clip.y << ", width=" << rec_clip.width + << ", height=" << rec_clip.height + << ", wear mask: " << (outptr[1] > classify_threshold) + << std::endl; + } + + // Write Result to Image File + int start = img_path.find_last_of("/"); + int end = img_path.find_last_of("."); + std::string img_name = img_path.substr(start + 1, end - start - 1); + std::string result_name = img_name + "_mask_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] + << " detction_model_dir classification_model_dir image_path\n"; + exit(1); + } + std::string detect_model_dir = argv[1]; + std::string classify_model_dir = argv[2]; + std::string img_path = argv[3]; + RunModel(detect_model_dir, classify_model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0cf59e185e1330b7d8487d562afa0af29236007 --- /dev/null +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <arm_neon.h>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+// Read the label file: for each line, keep the text after the first space
+// (dropping the trailing newline).
+void load_labels(std::string path, std::vector<std::string>* labels) {
+  FILE* fp = fopen(path.c_str(), "r");
+  if (fp == nullptr) {
+    printf("load label file failed \n");
+    return;
+  }
+  while (!feof(fp)) {
+    char str[1024];
+    fgets(str, 1024, fp);
+    std::string str_s(str);
+
+    if (str_s.length() > 0) {
+      for (int i = 0; i < str_s.length(); i++) {
+        if (str_s[i] == ' ') {
+          std::string strr = str_s.substr(i, str_s.length() - i - 1);
+          labels->push_back(strr);
+          i = str_s.length();
+        }
+      }
+    }
+  }
+  fclose(fp);
+}
+
+void print_topk(const float* scores,
+                const int size,
+                const int topk,
+                const std::vector<std::string>& labels) {
+  std::vector<std::pair<float, int>> vec;
+  vec.resize(size);
+  for (int i = 0; i < size; i++) {
+    vec[i] = std::make_pair(scores[i], i);
+  }
+
+  std::partial_sort(vec.begin(),
+                    vec.begin() + topk,
+                    vec.end(),
+                    std::greater<std::pair<float, int>>());
+
+  // Print the top-k indices and scores
+  for (int i = 0; i < topk; i++) {
+    float score = vec[i].first;
+    int index = vec[i].second;
+    printf("i: %d, index: %d, name: %s, score: %f \n",
+           i,
+           index,
+           labels[index].c_str(),
+           score);
+  }
+}
+
+// Fill the tensor with mean and scale, and transpose the layout from
+// NHWC to NCHW; NEON intrinsics process four pixels per iteration.
+void neon_mean_scale(
+    const float* din, float* dout, int size, float* mean, float* scale) {
+  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
+  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
+  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
+  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
+  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
+  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
+
+  float* dout_c0 = dout;
+  float* dout_c1 = dout + size;
+  float* dout_c2 = dout + size * 2;
+
+  int i = 0;
+  for (; i < size - 3; i += 4) {
+    float32x4x3_t vin3 = vld3q_f32(din);
+    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
+    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
+    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
+    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
+    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
+    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
+    vst1q_f32(dout_c0, vs0);
+    vst1q_f32(dout_c1, vs1);
+    vst1q_f32(dout_c2, vs2);
+
+    din += 12;
+    dout_c0 += 4;
+    dout_c1 += 4;
+    dout_c2 += 4;
+  }
+  // Scalar tail: write each channel to its own plane and divide by scale,
+  // matching the vectorized multiply-by-reciprocal above.
+  for (; i < size; i++) {
+    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
+    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
+    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
+  }
+}
+
+void pre_process(const cv::Mat& img,
+                 int width,
+                 int height,
+                 Tensor dstTensor,
+                 float* means,
+                 float* scales) {
+  cv::Mat rgb_img;
+  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
+  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
+  cv::Mat imgf;
+  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
+  const float* dimg = reinterpret_cast<const float*>(imgf.data);
+  float* data = dstTensor.mutable_data<float>();
+  neon_mean_scale(dimg, data, width * height, means, scales);
+}
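+
+// With the ImageNet statistics passed in from RunModel below
+// (means {0.485, 0.456, 0.406}, scales {0.229, 0.224, 0.225}),
+// pre_process above yields out = (pixel / 255 - mean) / scale for every
+// channel, i.e. the standard ImageNet normalization.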
+
+void RunModel(std::string model_dir,
+              std::string img_path,
+              const std::vector<std::string>& labels,
+              const int topk,
+              int width,
+              int height) {
+  // 1. Set MobileConfig
+  MobileConfig config;
+  config.set_model_dir(model_dir);
+
+  // 2. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 3. Prepare input data from image
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, height, width});
+  auto* data = input_tensor->mutable_data<float>();  // allocates the buffer
+  // Read the image and preprocess it directly into the input tensor
+  cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
+  float means[3] = {0.485f, 0.456f, 0.406f};
+  float scales[3] = {0.229f, 0.224f, 0.225f};
+  pre_process(img, width, height, *input_tensor, means, scales);
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output and post process
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  auto* outptr = output_tensor->data<float>();
+  auto shape_out = output_tensor->shape();
+  int64_t cnt = 1;
+  for (auto& i : shape_out) {
+    cnt *= i;
+  }
+  print_topk(outptr, cnt, topk, labels);
+}
+
+int main(int argc, char** argv) {
+  if (argc < 4) {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " model_dir image_path label_file\n";
+    exit(1);
+  }
+  printf("parameters model_dir, image_path and label_file are necessary \n");
+  printf("parameters topk, input_width and input_height are optional \n");
+  std::string model_dir = argv[1];
+  std::string img_path = argv[2];
+  std::string label_file = argv[3];
+  std::vector<std::string> labels;
+  load_labels(label_file, &labels);
+  int topk = 5;
+  int height = 224;
+  int width = 224;
+  if (argc > 4) {
+    topk = atoi(argv[4]);
+  }
+  if (argc > 6) {
+    width = atoi(argv[5]);
+    height = atoi(argv[6]);
+  }
+
+  RunModel(model_dir, img_path, labels, topk, width, height);
+  return 0;
+}
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index aa084d1fef7871ef11ac4864b30b3786691de387..0c9da1a76422edae45dfeec5d38556a5e2322a85 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -13,12 +13,10 @@
 // limitations under the License.
 
 #include <gflags/gflags.h>
-#include <stdio.h>
+#include <iostream>
 #include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-#include "paddle_use_passes.h"   // NOLINT
+#include "paddle_api.h"         // NOLINT
+#include "paddle_use_passes.h"  // NOLINT
 
 using namespace paddle::lite_api;  // NOLINT
 
@@ -78,14 +76,22 @@ void RunModel() {
   // 6. Get output
   std::unique_ptr<const Tensor> output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+              << std::endl;
   }
 }
 
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_model_dir == "" || FLAGS_optimized_model_dir == "") {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " --model_dir=<your-model-dir>"
+              << " --optimized_model_dir=<your-optimized-model-dir> "
+              << " --prefer_int8_kernel=[true|false]\n";
+    exit(1);
+  }
   RunModel();
   return 0;
 }
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
index e1833814cad17b2af182443874c69f4c91e542fc..9d923cb87da5244e4550be3fb6936a650ec9b53a 100644
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -12,27 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <gflags/gflags.h>
-#include <stdio.h>
+#include <iostream>
 #include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_api.h"  // NOLINT
 
 using namespace paddle::lite_api;  // NOLINT
 
-DEFINE_string(model_dir, "", "Model dir path.");
-
 int64_t ShapeProduction(const shape_t& shape) {
   int64_t res = 1;
   for (auto i : shape) res *= i;
   return res;
 }
 
-void RunModel() {
+void RunModel(std::string model_dir) {
   // 1. Set MobileConfig
   MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir);
+  config.set_model_dir(model_dir);
+  // To load a model transformed by opt after release/v2.3.0, please use
+  // `set_model_from_file` as shown below.
+  // config.set_model_from_file(model_dir);
 
   // 2. Create PaddlePredictor by MobileConfig
   std::shared_ptr<PaddlePredictor> predictor =
@@ -52,14 +50,19 @@ void RunModel() {
   // 5. Get output
   std::unique_ptr<const Tensor> output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
+              << std::endl;
   }
 }
 
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
+  if (argc < 2) {
+    std::cerr << "[ERROR] usage: " << argv[0] << " naive_buffer_model_dir\n";
+    exit(1);
+  }
+  std::string model_dir = argv[1];
+  RunModel(model_dir);
   return 0;
 }
diff --git a/lite/demo/cxx/ssd_detection/ssd_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2408afcbf64a24924eca119a9d9481dc030250c9
--- /dev/null
+++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc
@@ -0,0 +1,209 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = { + "background", "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", + "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c1++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c2++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {0.5f, 0.5f, 0.5f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh && static_cast(data[0]) > 0) { + Object obj; + int x = static_cast(data[2] * oriw); + int y = static_cast(data[3] * orih); + int w = static_cast(data[4] * oriw) - x; + int h = static_cast(data[5] * orih) - y; + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.batch_id = 0; + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = 
std::string(class_names[obj.class_id]) + ": " + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 2; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.35 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 10; + origin.y = y + text_size.height + 10; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + const int in_width = 300; + const int in_height = 300; + input_tensor->Resize({1, 3, in_height, in_width}); + auto* data = input_tensor->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = ShapeProduction(shape_out); + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); + int start = img_path.find_last_of("/"); + int end = img_path.find_last_of("."); + std::string img_name = img_path.substr(start + 1, end - start - 1); + std::string result_name = img_name + "_ssd_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md new file mode 100644 index 0000000000000000000000000000000000000000..36d2985a4fd4f243027f8caab9b6c5a8beb94cad --- /dev/null +++ b/lite/demo/cxx/test_cv/README.md @@ -0,0 +1,131 @@ +# Using the image preprocess library +1. Download the source code (https://github.com/PaddlePaddle/Paddle-Lite), turn on LITE_WITH_CV=ON, and build in full_publish mode +example: +```shell +set BUILD_WITH_CV=ON or LITE_WITH_CV=ON +./lite/tools/build.sh +--arm_os=android +--arm_abi=armv8 +--arm_lang=gcc +--android_stl=c++_static +full_publish +``` + +2. Prepare the model and optimize it +example: +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./lite/tools/build.sh build_optimize_tool +./build.model_optimize_tool/lite/api/model_optimize_tool +--optimize_out_type=naive_buffer +--optimize_out=model_dir +--model_dir=model_dir +--prefer_int8_kernel=false +``` + +3. Build and run the complete test_model_cv demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- Edit the Makefile and comment out the rules that build test_img_propress + ```shell + test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + #test_img_propress: fetch_opencv test_img_propress.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + #test_img_propress.o: test_img_propress.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + rm -f test_model_cv.o + rm -f test_model_cv + #rm -f test_img_propress.o + #rm -f test_img_propress + ``` +- Edit ../../..//cxx/include/paddle_image_preprocess.h and change the include path of the paddle_api.h headers + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- The model under test must be an optimized model + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_model_cv /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " +``` +On success, part of the prediction results will be printed to the console + +4. Build and run the complete test_img_preprocess demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- Edit the Makefile and comment out the rules that build test_model_cv + ```shell + #test_model_cv: fetch_opencv test_model_cv.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + #test_model_cv.o: test_model_cv.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + test_img_propress: fetch_opencv test_img_propress.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + test_img_propress.o: test_img_propress.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + #rm -f test_model_cv.o + #rm -f test_model_cv + rm -f test_img_propress.o + rm -f test_img_propress + ``` +- Edit ../../..//cxx/include/paddle_image_preprocess.h and change the include path of the paddle_api.h headers + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- The model under test must be an optimized model + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_img_propress /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_img_propress +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " +adb -s device_id pull /data/local/tmp/resize.jpg ./ +adb -s device_id pull /data/local/tmp/convert.jpg ./ +adb -s device_id pull /data/local/tmp/flip.jpg ./ +adb -s device_id pull /data/local/tmp/rotate.jpg ./ +``` +On success, the console will print the elapsed time of OpenCV and Paddle-Lite; the generated image preprocess result images (e.g. resize.jpg, convert.jpg) will also appear in the test_cv directory diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2cbd66cc0a15a1032141641d83fbf8db85d20bf --- /dev/null +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT +typedef paddle::lite_api::Tensor Tensor; +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite::utils::cv::TransParam TransParam; +typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; +typedef paddle::lite_api::DataLayoutType LayoutType; +using namespace paddle::lite_api; // NOLINT + +void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT + for (int i = 0; i < mat.rows; i++) { + for (int j = 0; j < mat.cols; j++) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } + } +} +void test_img(std::vector cluster_id, + std::vector thread_num, + std::string img_path, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int width, + int height, + float rotate, + FlipParam flip, + LayoutType layout, + std::string model_dir, + int test_iter = 1) { + // init + // paddle::lite::DeviceInfo::Init(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + int srch = img.rows; + int srcw = img.cols; + for (auto& cls : cluster_id) { + for (auto& th : thread_num) { + std::cout << "cluster: " << cls << ", threads: " << th << std::endl; + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode((PowerMode)cls); + config.set_threads(th); + std::cout << "model: " << model_dir; + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3.
Prepare input data from image + std::unique_ptr input_tensor(predictor->GetInput(0)); + + /* + imread(img_path, param) + IMREAD_UNCHANGED(<0) loads the image as-is, without any conversion + IMREAD_GRAYSCALE ( 0) loads the image as a grayscale image + IMREAD_COLOR (>0) loads the image as a 3-channel BGR image + */ + cv::Mat img; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("format %d is not supported \n", srcFormat); + return; + } + if (img.empty()) { + std::cout << "opencv read image " << img_path.c_str() << " failed" + << std::endl; + return; + } + int srch = img.rows; + int srcw = img.cols; + int dsth = height; + int dstw = width; + + std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 + << ", height= " << srch << ", width= " << srcw + << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } + if (srcFormat == ImageFormat::BGR) { + std::cout << "srcFormat: BGR" << std::endl; + } + if (srcFormat == ImageFormat::RGB) { + std::cout << "srcFormat: RGB" << std::endl; + } + std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 + << ", height=" << dsth << ", width=" << dstw + << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } + if (dstFormat == ImageFormat::BGR) { + std::cout << "dstFormat: BGR" << std::endl; + } + if (dstFormat == ImageFormat::RGB) { + std::cout << "dstFormat: RGB" << std::endl; + } + + std::cout << "Rotate = " << rotate << ", Flip = " << flip + << ", Layout = " << static_cast(layout) << std::endl; + if (static_cast(layout) != 1 && static_cast(layout) != 3) { + std::cout << "layout " << static_cast(layout) + << " is not supported" << std::endl; + } + int size = 3 * srch * srcw; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + size = 3 * srch * srcw; + } else if (srcFormat == ImageFormat::GRAY) { + size = srch * srcw; + } + uint8_t* src = img.data; + + int out_size = srch * srcw; + int resize = dstw * dsth; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * srch * srcw; + resize = 3 * dsth * dstw; + } else if (dstFormat == ImageFormat::GRAY) { + out_size = srch * srcw; + resize = dsth * dstw; + } + // out + uint8_t* lite_dst = new uint8_t[out_size]; + uint8_t* resize_tmp = new uint8_t[resize]; + uint8_t* tv_out_ratote = new uint8_t[out_size]; + uint8_t* tv_out_flip = new uint8_t[out_size]; + std::vector shape_out = {1, 3, srch, srcw}; + + input_tensor->Resize(shape_out); + Tensor dst_tensor = *input_tensor; + std::cout << "opencv compute" << std::endl; + cv::Mat im_convert; + cv::Mat im_resize; + cv::Mat im_rotate; + cv::Mat im_flip; + double to_1 = 0; + double to_2 = 0; + double to_3 = 0; + double to_4 = 0; + double to1 = 0; + for (int i = 0; i < test_iter; i++) { + clock_t start = clock(); + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + im_convert = img; + } else if (dstFormat == ImageFormat::BGR && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); + } else if (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); + } else { + printf("convert format error \n"); +
return; + } + clock_t end = clock(); + to_1 += (end - begin); + + begin = clock(); + // resize default linear + cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + end = clock(); + to_2 += (end - begin); + + begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(im_convert.t(), im_rotate, 1); + } else if (rotate == 180) { + cv::flip(im_convert, im_rotate, -1); + } else if (rotate == 270) { + cv::flip(im_convert.t(), im_rotate, 0); + } + end = clock(); + to_3 += (end - begin); + + begin = clock(); + // flip + cv::flip(im_convert, im_flip, flip); + end = clock(); + to_4 += (end - begin); + clock_t ovet = clock(); + to1 += (ovet - start); + } + + std::cout << "Paddle-lite compute" << std::endl; + double lite_to = 0; + double lite_to_1 = 0; + double lite_to_2 = 0; + double lite_to_3 = 0; + double lite_to_4 = 0; + double lite_to_5 = 0; + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = dsth; + tparam.ow = dstw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + + for (int i = 0; i < test_iter; ++i) { + clock_t start = clock(); + clock_t begin = clock(); + image_preprocess.imageConvert(src, lite_dst); + clock_t end = clock(); + lite_to_1 += (end - begin); + + begin = clock(); + image_preprocess.imageResize(lite_dst, resize_tmp); + end = clock(); + lite_to_2 += (end - begin); + + begin = clock(); + image_preprocess.imageRotate( + lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); + end = clock(); + lite_to_3 += (end - begin); + + begin = clock(); + image_preprocess.imageFlip( + lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); + end = clock(); + lite_to_4 += (end - begin); + + clock_t over = clock(); + lite_to += (over - start); + + begin = clock(); + image_preprocess.image2Tensor(lite_dst, + &dst_tensor, + (ImageFormat)dstFormat, + srcw, + srch, + layout, + means, + scales); + end = clock(); + lite_to_5 += (end - begin); + } + to_1 = 1000 * to_1 / CLOCKS_PER_SEC; + to_2 = 1000 * to_2 / CLOCKS_PER_SEC; + to_3 = 1000 * to_3 / CLOCKS_PER_SEC; + to_4 = 1000 * to_4 / CLOCKS_PER_SEC; + to1 = 1000 * to1 / CLOCKS_PER_SEC; + std::cout << "opencv convert run time: " << to_1 + << "ms, avg: " << to_1 / test_iter << std::endl; + std::cout << "opencv resize run time: " << to_2 + << "ms, avg: " << to_2 / test_iter << std::endl; + std::cout << "opencv rotate run time: " << to_3 + << "ms, avg: " << to_3 / test_iter << std::endl; + std::cout << "opencv flip time: " << to_4 + << "ms, avg: " << to_4 / test_iter << std::endl; + std::cout << "opencv total run time: " << to1 + << "ms, avg: " << to1 / test_iter << std::endl; + std::cout << "------" << std::endl; + + lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; + lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; + lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; + lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; + lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; + lite_to = 1000 * lite_to / CLOCKS_PER_SEC; + std::cout << "lite convert run time: " << lite_to_1 + << "ms, avg: " << lite_to_1 / test_iter << std::endl; + std::cout << "lite resize run time: " << lite_to_2 + << "ms, avg: " << lite_to_2 / test_iter << std::endl; + std::cout << "lite rotate run time: " << lite_to_3 + << "ms, avg: " << lite_to_3 / test_iter << std::endl; + std::cout << "lite flip time: " << lite_to_4 + << "ms, avg: " << lite_to_4 / test_iter << std::endl; + std::cout << "lite total run time: " << lite_to + << "ms, avg: " << lite_to / test_iter 
<< std::endl; + std::cout << "lite img2tensor time: " << lite_to_5 + << "ms, avg: " << lite_to_5 / test_iter << std::endl; + std::cout << "------" << std::endl; + + double max_ratio = 0; + double max_diff = 0; + const double eps = 1e-6f; + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + std::string convert_name = dst_path + "/convert.jpg"; + std::string rotate_name = dst_path + "/rotate.jpg"; + std::string flip_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat(dsth, dstw, CV_8UC3); + cv::Mat convert_mat(srch, srcw, CV_8UC3); + cv::Mat rotate_mat; + if (rotate == 90 || rotate == 270) { + rotate_mat = cv::Mat(srcw, srch, CV_8UC3); + } else { + rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + } + cv::Mat flip_mat(srch, srcw, CV_8UC3); + fill_with_mat(resize_mat, resize_tmp); + fill_with_mat(convert_mat, lite_dst); + fill_with_mat(rotate_mat, tv_out_ratote); + fill_with_mat(flip_mat, tv_out_flip); + cv::imwrite(convert_name, convert_mat); + cv::imwrite(resize_name, resize_mat); + cv::imwrite(rotate_name, rotate_mat); + cv::imwrite(flip_name, flip_mat); + delete[] lite_dst; + delete[] resize_tmp; + delete[] tv_out_ratote; + delete[] tv_out_flip; + } + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " image_path dst_path srcFormat dstFormat width height" + << " [model_dir] [flip] [rotate] [layout]\n"; + exit(1); + } + std::string image_path = argv[1]; + std::string dst_path = argv[2]; + int srcFormat = atoi(argv[3]); + int dstFormat = atoi(argv[4]); + int width = atoi(argv[5]); + int height = atoi(argv[6]); + int flip = -1; + float rotate = 90; + int layout = 1; + std::string model_dir = "mobilenet_v1"; + if (argc > 7) { + model_dir = argv[7]; + } + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + layout = atoi(argv[10]); + } + test_img({3}, + {1, 2, 4}, + image_path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + width, + height, + rotate, + (FlipParam)flip, + (LayoutType)layout, + model_dir, + 20); + return 0; +} diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f408bf4a55ea2d499e39902201597c0e8c6e4e --- /dev/null +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
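A note on the timing used in these benchmark demos: `clock()` measures CPU time accumulated over all threads of the process, so for multi-threaded runs it can report more than the elapsed wall time. A small wall-clock helper based on `std::chrono` (a sketch, not part of the demos) would look like:

```cpp
#include <chrono>

// Minimal wall-clock timer; fn is any callable, e.g.
//   double t = elapsed_ms([&] { predictor->Run(); });
template <typename F>
double elapsed_ms(F&& fn) {
  auto t0 = std::chrono::steady_clock::now();
  fn();
  auto t1 = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(t1 - t0).count();
}
```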
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c1++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c2++) = (*(din++) - mean[2]) * scale[2]; + } +} +void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { +#ifdef LITE_WITH_CV + typedef paddle::lite::utils::cv::ImageFormat ImageFormat; + typedef paddle::lite::utils::cv::FlipParam FlipParam; + typedef paddle::lite::utils::cv::TransParam TransParam; + typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; + typedef paddle::lite_api::DataLayoutType LayoutType; + // init TransParam + TransParam tp; + tp.iw = img.cols; + tp.ih = img.rows; + tp.ow = width; + tp.oh = height; + ImageFormat srcFormat = ImageFormat::BGR; + ImageFormat dstFormat = ImageFormat::RGB; + // init ImagePreprocess + ImagePreprocess img_process(srcFormat, dstFormat, tp); + // init temp var + const uint8_t* img_ptr = reinterpret_cast(img.data); + uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3]; + uint8_t* resize_ptr = new uint8_t[width * height * 3]; + // do convert bgr--rgb + img_process.imageConvert(img_ptr, rgb_ptr); + // do resize + img_process.imageResize(rgb_ptr, resize_ptr); + // data--tensor and normalize + float means[3] = {103.94f, 116.78f, 123.68f}; + float scales[3] = {0.017f, 0.017f, 0.017f}; + img_process.image2Tensor( + resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales); + float* data = dstTensor.mutable_data(); +#else + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + float* data = dstTensor.mutable_data(); + neon_mean_scale(dimg, data, width * height, means, scales); +#endif +} + +void RunModel(std::string model_dir, + std::string img_path, + std::vector input_shape, + PowerMode power_mode, + int thread_num, + int
test_iter, + int warmup = 0) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); + auto* data = input_tensor->mutable_data(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + + pre_process(img, input_shape[3], input_shape[2], *input_tensor); + + // 4. Run predictor + for (int i = 0; i < warmup; ++i) { + predictor->Run(); + } + double lps = 0.f; + double min_time = 1000000.f; + double max_time = 0.f; + for (int i = 0; i < test_iter; ++i) { + clock_t begin = clock(); + predictor->Run(); + clock_t end = clock(); + double t = (end - begin) * 1000; + t = t / CLOCKS_PER_SEC; + lps += t; + if (t < min_time) { + min_time = t; + } + if (t > max_time) { + max_time = t; + } + std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl; + } + std::cout << "================== Speed Report ===================" + << std::endl; + std::cout << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup + << ", repeats: " << test_iter << ", avg time: " << lps / test_iter + << " ms" + << ", min time: " << min_time << " ms" + << ", max time: " << max_time << " ms." << std::endl; + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int output_num = 1; + for (int i = 0; i < shape_out.size(); ++i) { + output_num *= shape_out[i]; + } + std::cout << "output_num: " << output_num << std::endl; + for (int i = 0; i < output_num; i += 100) { + std::cout << "i: " << i << ", out: " << outptr[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " model_dir image_path input_shape\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + std::vector input_shape; + input_shape.push_back(atoi(argv[3])); + input_shape.push_back(atoi(argv[4])); + input_shape.push_back(atoi(argv[5])); + input_shape.push_back(atoi(argv[6])); + int power_mode = 3; + int threads = 1; + int test_iter = 100; + int warmup = 10; + if (argc > 7) { + power_mode = atoi(argv[7]); + } + if (argc > 8) { + threads = atoi(argv[8]); + } + if (argc > 9) { + test_iter = atoi(argv[9]); + } + if (argc > 10) { + warmup = atoi(argv[10]); + } + RunModel(model_dir, + img_path, + input_shape, + (PowerMode)power_mode, + threads, + test_iter, + warmup); + return 0; +} diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9beb1ed28de1f3c28eb5c03b3b660d518ee10c5 --- /dev/null +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = {"person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c1++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c2++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize( + rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.485f, 0.456f, 0.406f};
std::vector scale = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh) { + Object obj; + int x = static_cast(data[2]); + int y = static_cast(data[3]); + int w = static_cast(data[4] - data[2] + 1); + int h = static_cast(data[5] - data[3] + 1); + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 1, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 1; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.5 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 3; + origin.y = y + text_size.height + 3; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + const int in_width = 608; + const int in_height = 608; + + // 3. Prepare input data from image + // input 0 + std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); + input_tensor0->Resize({1, 3, in_height, in_width}); + auto* data0 = input_tensor0->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data0); + // input1 + std::unique_ptr input_tensor1(std::move(predictor->GetInput(1))); + input_tensor1->Resize({1, 2}); + auto* data1 = input_tensor1->mutable_data(); + data1[0] = img.rows; + data1[1] = img.cols; + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.5f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_yolov3_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h index eac5332b53c857b05aacbfa95ee2e4b9fcd98a93..c3af7e9f6c3588f404c614430bf01f7ab5e099e5 100644 --- a/lite/fluid/eigen.h +++ b/lite/fluid/eigen.h @@ -30,13 +30,20 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const lite::DDim& dims) { - PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size"); + PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size"); Type ret; for (size_t d = 0; d < dims.size(); d++) { ret[d] = dims[d]; } return ret; } + + static Type From(const DDim::value_type length) { + PADDLE_ENFORCE_EQ(D, 1, "D must be 1."); + Type ret; + ret[0] = length; + return ret; + } }; // Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. @@ -52,7 +59,7 @@ struct EigenTensor { using ConstType = Eigen::TensorMap>; - static Type From(Tensor& tensor, lite::DDim dims) { // NOLINT + static Type From(Tensor& tensor, const lite::DDim& dims) { // NOLINT return Type(const_cast(tensor.data()), EigenDim::From(dims)); // NOLINT } @@ -61,7 +68,7 @@ struct EigenTensor { return From(tensor, tensor.dims()); } // NOLINT - static ConstType From(const Tensor& tensor, lite::DDim dims) { + static ConstType From(const Tensor& tensor, const lite::DDim& dims) { return ConstType(tensor.data(), EigenDim::From(dims)); } @@ -97,14 +104,15 @@ template { // Flatten reshapes a Tensor into an EigenVector. static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); + return typename EigenVector::Type( + const_cast(tensor.data()), + EigenDim<1>::From(tensor.dims().production())); } static typename EigenVector::ConstType Flatten( const Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); + return typename EigenVector::ConstType( + tensor.data(), EigenDim<1>::From(tensor.dims().production())); } }; diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 0bfd39ae9a0bdf6e8af606711fd4dcc6011994b5..4e0092b392eb31ce81f2a410ea86002b343f0aec 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(bm) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 731df6e6629826016cafc386284a17f754f83ece..60d5e3b5e234ef19cd144100d07441eb4acf48de 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,3 +1,12 @@ +# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered +# to the model_optimize_tool. 
+if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) + return() +endif() + +message(STATUS "compile with lite ARM kernels") + +# 1. basic kernels for basic models # for conv op add_kernel(conv_depthwise ARM basic SRCS conv_depthwise.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_direct ARM basic SRCS conv_direct.cc DEPS ${lite_kernel_deps} math_arm) @@ -14,50 +23,65 @@ add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_ add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(unsqueeze_compute_arm ARM extra SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} 
math_arm) +add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(grid_sampler_compute_arm ARM basic SRCS grid_sampler_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 2. other basic kernels: basic kernels that are not used in basic models +add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 3.
extra kernels +add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(stack_compute_arm ARM extra SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(affine_channel_compute_arm ARM extra SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_prod_compute_arm ARM extra SRCS reduce_prod_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(split_lod_tensor_compute_arm ARM extra SRCS split_lod_tensor_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(merge_lod_tensor_compute_arm ARM extra SRCS merge_lod_tensor_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(range_compute_arm ARM extra SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) + # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -74,35 +98,28 @@ add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) -# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered -# to the model_optimize_tool. -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -message(STATUS "compile with lite ARM kernels") lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) -lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) -lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) -lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) -lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) -lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) - +lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) if(LITE_BUILD_EXTRA) + lite_cc_test(test_split_lod_tensor_compute_arm SRCS split_lod_tensor_compute_test.cc DEPS split_lod_tensor_compute_arm) + lite_cc_test(test_merge_lod_tensor_compute_arm SRCS merge_lod_tensor_compute_test.cc DEPS merge_lod_tensor_compute_arm) + lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) + lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) + lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index bc274ea22485e84a1cc9145e62fc967f2847c5dd..266ae1fc916af4303aca274c39b9b4923fdbb154 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -56,6 +56,12 @@ void CastCompute::Run() { float* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 3 && param.out_dtype == 2) { + const int64_t* x_data_begin = param.X->data(); + const int64_t* x_data_end = 
x_data_begin + param.X->numel(); + int32_t* out_data = param.Out->mutable_data(); + std::transform( + x_data_begin, x_data_end, out_data, TransOp); } else { LOG(FATAL) << "other has not been implemented"; } @@ -68,6 +74,6 @@ void CastCompute::Run() { REGISTER_LITE_KERNEL( cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d54b96348e866bbe16898ddd6fdbd45beb62afa0 --- /dev/null +++ b/lite/kernels/arm/collect_fpn_proposals_compute.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/collect_fpn_proposals_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +struct ScoreWithID { + float score; + int batch_id; + int index; + int level; + ScoreWithID() { + batch_id = -1; + index = -1; + level = -1; + } + ScoreWithID(float score_, int batch_id_, int index_, int level_) { + score = score_; + batch_id = batch_id_; + index = index_; + level = level_; + } +}; + +static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) { + return a.score > b.score; +} + +static inline bool CompareByBatchid(ScoreWithID a, ScoreWithID b) { + return a.batch_id < b.batch_id; +} + +void CollectFpnProposalsCompute::Run() { + auto& param = Param(); + auto multi_layer_rois = param.multi_level_rois; + auto multi_layer_scores = param.multi_level_scores; + auto* fpn_rois = param.fpn_rois; + int post_nms_topN = param.post_nms_topN; + + if (multi_layer_rois.size() != multi_layer_scores.size()) { + LOG(FATAL) << "multi_layer_rois.size() should be equal to " + "multi_layer_scores.size()"; + } + + size_t num_fpn_level = multi_layer_rois.size(); + std::vector integral_of_all_rois(num_fpn_level + 1, 0); + for (size_t i = 0; i < num_fpn_level; ++i) { + auto cur_rois_lod = multi_layer_rois[i]->lod().back(); + integral_of_all_rois[i + 1] = static_cast( + integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]); + } + + std::vector scores_of_all_rois( + integral_of_all_rois[num_fpn_level], ScoreWithID()); + for (int i = 0; i < num_fpn_level; ++i) { + const float* cur_level_scores = multi_layer_scores[i]->data(); + int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i]; + auto cur_scores_lod = multi_layer_scores[i]->lod().back(); + int cur_batch_id = 0; + for (int j = 0; j < cur_level_num; ++j) { + if (j >=
cur_scores_lod[cur_batch_id + 1]) { + cur_batch_id++; + } + int cur_index = j + integral_of_all_rois[i]; + scores_of_all_rois[cur_index].score = cur_level_scores[j]; + scores_of_all_rois[cur_index].index = j; + scores_of_all_rois[cur_index].level = i; + scores_of_all_rois[cur_index].batch_id = cur_batch_id; + } + } + + // keep top post_nms_topN rois, sort the rois by the score + if (post_nms_topN > integral_of_all_rois[num_fpn_level]) { + post_nms_topN = integral_of_all_rois[num_fpn_level]; + } + std::stable_sort( + scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByScore); + scores_of_all_rois.resize(post_nms_topN); + // sort by batch id + std::stable_sort( + scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByBatchid); + // create a pointer array + std::vector multi_fpn_rois_data(num_fpn_level); + for (int i = 0; i < num_fpn_level; ++i) { + multi_fpn_rois_data[i] = multi_layer_rois[i]->data(); + } + + // initialize the outputs + const int kBoxDim = 4; + auto fpn_rois_data = fpn_rois->mutable_data(); + std::vector lod0(1, 0); + int cur_batch_id = 0; + for (int i = 0; i < post_nms_topN; ++i) { + int cur_fpn_level = scores_of_all_rois[i].level; + int cur_level_index = scores_of_all_rois[i].index; + std::memcpy(fpn_rois_data, + multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim, + kBoxDim * sizeof(float)); + fpn_rois_data += kBoxDim; + if (scores_of_all_rois[i].batch_id != cur_batch_id) { + cur_batch_id = scores_of_all_rois[i].batch_id; + lod0.emplace_back(i); + } + } + lod0.emplace_back(post_nms_topN); + lite::LoD lod; + lod.emplace_back(lod0); + fpn_rois->set_lod(lod); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(collect_fpn_proposals, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::CollectFpnProposalsCompute, + def) + .BindInput("MultiLevelRois", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("MultiLevelScores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("FpnRois", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/arm/collect_fpn_proposals_compute.h similarity index 62% rename from lite/kernels/xpu/bridges/registry.cc rename to lite/kernels/arm/collect_fpn_proposals_compute.h index 4ab1b69a25a29aeb1c1ceaff25525459ef2e94cd..f1e7448a07aee4f9c2b57a1c6d2223f4262c59b4 100644 --- a/lite/kernels/xpu/bridges/registry.cc +++ b/lite/kernels/arm/collect_fpn_proposals_compute.h @@ -12,30 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
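For reference, the selection logic in `CollectFpnProposalsCompute::Run` above reduces to: sort all RoIs across FPN levels by score, keep the top `post_nms_topN`, then re-sort by batch id so per-batch LoD offsets can be rebuilt. A self-contained sketch of that select-then-regroup step, with `Item` standing in for the kernel's `ScoreWithID`:

```cpp
#include <algorithm>
#include <vector>

// Stand-in for ScoreWithID; only the fields the selection needs are kept.
struct Item {
  float score;
  int batch_id;
};

// Keep the top_n highest-scoring items across all levels, then restore
// batch order, mirroring the two stable_sort passes in Run().
std::vector<Item> select_top_n(std::vector<Item> items, int top_n) {
  if (top_n > static_cast<int>(items.size())) {
    top_n = static_cast<int>(items.size());
  }
  // A strict comparison (>) keeps stable_sort well-defined; ties preserve
  // the original, level-major order.
  std::stable_sort(items.begin(), items.end(), [](const Item& a, const Item& b) {
    return a.score > b.score;
  });
  items.resize(top_n);
  std::stable_sort(items.begin(), items.end(), [](const Item& a, const Item& b) {
    return a.batch_id < b.batch_id;
  });
  return items;
}
```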
-#include "lite/kernels/xpu/bridges/registry.h" -#include +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/axpy_op.h" namespace paddle { namespace lite { namespace kernels { -namespace xpu { -namespace bridges { +namespace arm { -Factory& Factory::Instance() { - static Factory g_xpu_bridge; - return g_xpu_bridge; -} +class CollectFpnProposalsCompute + : public KernelLite { + public: + using param_t = operators::CollectFpnProposalsParam; -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} + void Run() override; -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} + virtual ~CollectFpnProposalsCompute() = default; +}; -} // namespace bridges -} // namespace xpu +} // namespace arm } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc index 95014b4ccd427e152dfe919643afa5ff5eb3011d..6118cbc6e403645cada84d2434497b084636a4a3 100644 --- a/lite/kernels/arm/compare_compute.cc +++ b/lite/kernels/arm/compare_compute.cc @@ -112,6 +112,42 @@ void CompareCompute::Run() { } } +template